1. Filter primary dataset to generate proxy-specific subsets
First, import the primary dataset that was generated in Appendix
1.
load("Filtered.trace.toc.full.20230205.RData")
## Warning in load("Filtered.trace.toc.full.20230205.RData"): strings not
## representable in native encoding will be translated to UTF-8
Load required packages.
library(dplyr)
library(deeptime)
library(ggplot2)
library(randomForest)
library(doParallel)
library(foreach)
library(progressr)
library(stringi)
library(ggcorrplot)
#handlers(global = TRUE)
Update base of the Cryogenian in deeptime.
periods.edit <- deeptime::periods
periods.edit[14,2] <- 720
periods.edit[15,3] <- 720
periods.edit$color[13:15] <- c("#FED96A", "#FECC5C", "#FEBF4E")
Update non-latin characters for saving files
trace.toc.full<- filter(trace.toc.full, !(section.name == "Avalon-2"))
trace.toc.full$section.name <- stri_trans_general(trace.toc.full$section.name, "latin-ascii")
Categorical variables as factors
It will be useful for the treatment of samples with partial data that
categorical geological context variables are factor objects, not
character objects. We therefore assign all categorical variables to be
factors now.
trace.toc.full$site.type <- factor(trace.toc.full$site.type)
trace.toc.full$metamorphic.bin <- factor(trace.toc.full$metamorphic.bin)
trace.toc.full$basin.type <- factor(trace.toc.full$basin.type)
trace.toc.full$environmental.bin <- factor(trace.toc.full$environmental.bin)
trace.toc.full$lithology.name <- factor(trace.toc.full$lithology.name)
We then filter this primary dataset to generate the specific
subdatasets used in our primary analyses of different proxies. We
further calculate the number of rows in each of these subset
dataframes.
Molybdenum
For our primary Mo random forest analyses, we are interested in
samples deposited in anoxic depositional environments that also have
iron speciation (specifically Fepy/FeHR) data. We therefore filter the
primary dataset to include only samples that are classified as anoxic
based upon the iron speciation proxy (samples without iron speciation
data that would be classified as anoxic based on Fe/Al will not have the
require variables for the primary random forest analyses). We also
remove any samples with no Mo, TOC and/or Fepy/FeHR data.
Mo.anox.py.rf <- trace.toc.full %>%
filter(!is.na(Mo..ppm.) & !is.na(Fe.py.FeHR) & !is.na(TOC..wt..)) %>%
filter(FeHR.FeT >= 0.38)
nrow(Mo.anox.py.rf)
## [1] 2355
Uranium
For our primary U random forest analyses, we are only interested in
samples deposited in anoxic (ferruginous or euxinic) depositional
environments. We therefore filter the primary dataset to include only
samples that are classified as anoxic based upon iron speciation or
Fe/Al ratios. We also remove any samples with no U and/or TOC data.
U.anox.rf <- trace.toc.full %>%
filter(!is.na(U..ppm.) & !is.na(TOC..wt..)) %>%
filter(FeHR.FeT >= 0.38 | FeT.Al >= 0.53)
nrow(U.anox.rf)
## [1] 3409
Proportion euxinic
For our primary random forest analyses of the proportion of samples
that are euxinic, we are only interested in samples deposited in anoxic
(ferruginous or euxinic) depositional environments. Given that we use
iron speciation to determine the proportion of euxinic samples (and
therefore all samples in this analyses must have full iron speciation
data), we therefore filter the primary dataset to include only samples
that are classified as anoxic based upon iron speciation data. We also
remove samples with no Fepy/FeHR data, as with the other analyses
(although the requirement of the filtering step to have FeHR/FeT data
should achieve this as no samples should have partial iron speciation
data).
Fepy.anox.rf <- trace.toc.full %>%
filter(!is.na(Fe.py.FeHR)) %>%
filter(FeHR.FeT >= 0.38)
nrow(Fepy.anox.rf)
## [1] 3909
For the analysis of the proportion of euxinic samples, we also need
to code samples based upon whether they are euxinic (based on iron
speciation) in a binary fashion, following Sperling et al. (2015,
Nature).
Fepy.anox.rf$euxinic.Fe[Fepy.anox.rf$Fe.py.FeHR >= 0.7] <- 1
Fepy.anox.rf$euxinic.Fe[Fepy.anox.rf$Fe.py.FeHR < 0.7] <- 0
Total organic carbon
For our primary random forest analyses of total organic carbon we use
no redox filter. We just remove samples with no TOC data.
TOC.all.rf <- trace.toc.full %>%
filter(!is.na(TOC..wt..))
nrow(TOC.all.rf)
## [1] 12503
1b. Generate and plot cross-correlation matrices for all
datasets.
Start by generating matrices. Need to just identify numerical
variables and remove NAs
Mo.anox.py.rf.num <- Mo.anox.py.rf %>% select(site.latitude,
site.longitude,
interpreted.age,
Mo..ppm.,
TOC..wt..,
Fe.py.FeHR,
Al..wt..) %>%
na.omit()
Mo.anox.py.corr <- round(cor(Mo.anox.py.rf.num), 1)
U.anox.rf.num <- U.anox.rf %>% select(site.latitude,
site.longitude,
interpreted.age,
U..ppm.,
TOC..wt..,
Al..wt..) %>%
na.omit()
U.anox.corr <- round(cor(U.anox.rf.num), 1)
Fepy.anox.rf.num <- Fepy.anox.rf %>% select(site.latitude,
site.longitude,
interpreted.age,
euxinic.Fe,
TOC..wt..,
Al..wt..) %>%
na.omit()
Fepy.anox.corr <- round(cor(Fepy.anox.rf.num), 1)
TOC.all.rf.num <- TOC.all.rf %>% select(site.latitude,
site.longitude,
interpreted.age,
TOC..wt..,
Al..wt..) %>%
na.omit()
TOC.all.corr <- round(cor(TOC.all.rf.num), 1)
Mo.anox.py.corr.plot <- ggcorrplot(Mo.anox.py.corr, hc.order = TRUE, type = "lower", lab = TRUE, ggtheme = ggplot2::theme_void, colors = c("#6D9EC1", "white", "#E46726"))
ggsave("Figure Sx cross correlation between variables of Monte Carlo random forest datasets 20240207 100 iterations Mo.pdf", height=5, width=6)
U.anox.corr.plot <- ggcorrplot(U.anox.corr, hc.order = TRUE, type = "lower", lab = TRUE, ggtheme = ggplot2::theme_void, colors = c("#6D9EC1", "white", "#E46726"))
ggsave("Figure Sx cross correlation between variables of Monte Carlo random forest datasets 20240207 100 iterations U.pdf", height=5, width=6)
Fepy.anox.corr.plot <- ggcorrplot(Fepy.anox.corr, hc.order = TRUE, type = "lower", lab = TRUE, ggtheme = ggplot2::theme_void, colors = c("#6D9EC1", "white", "#E46726"))
ggsave("Figure Sx cross correlation between variables of Monte Carlo random forest datasets 20240207 100 iterations Fepy.pdf", height=5, width=6)
TOC.all.corr.plot <- ggcorrplot(TOC.all.corr, hc.order = TRUE, type = "lower", lab = TRUE, ggtheme = ggplot2::theme_void, colors = c("#6D9EC1", "white", "#E46726"))
ggsave("Figure Sx cross correlation between variables of Monte Carlo random forest datasets 20240207 100 iterations TOC.pdf", height=5, width=6)
# corr.sum <- ggarrange2(Mo.anox.py.corr.plot,
# U.anox.corr.plot,
# Fepy.anox.corr.plot,
# TOC.all.corr.plot,
# ncol=2)
#
# ggsave("Figure Sx cross correlation between variables of Monte Carlo random forest datasets 20240207 100 iterations.pdf", corr.sum, height=15, width=17)
2. Treatment of samples with partial data
Age uncertainty
We define a function to assign age uncertainties to samples with
missing maximum and minimum age estimates (i.e. only interpreted age).
We add ±5Myrs uncertainty on Phanerozoic ages and ±12.5Myrs on
Neoproterozoic ages as default.
age.unc <- function(data, Phanerozoic = 5, Proterozoic = 12.5){
# select which of the vector c(5,12.5) is appropriate by identifying which of the samples without min/max age have an interpreted age greater than 541. If TRUE, adding 1 to "TRUE" (1) gives 2 (i.e. 12.5Myr error from the vector c(5,12.5)). Adding 1 to "FALSE" (0) gives 1 (i.e. 5Myr error from the vector c(5,12.5)).
data$min.age[is.na(data$min.age)] <- data$interpreted.age[is.na(data$min.age)] - c(5,12.5)[(data$interpreted.age[is.na(data$min.age)] > 541) + 1]
data$max.age[is.na(data$max.age)] <- data$interpreted.age[is.na(data$max.age)] + c(5,12.5)[(data$interpreted.age[is.na(data$max.age)] > 541) + 1]
return(data)
}
Confirm that age assignment function works as intended using primary
Mo subsdataset as an example. Here we look only at the samples with
missing ages by filtering the dataset.
Mo.anox.py.rf.missing.ages <- Mo.anox.py.rf %>%
filter(is.na(min.age) & is.na(max.age))
Mo.anox.py.rf.missing.ages <- age.unc(data = Mo.anox.py.rf.missing.ages)
Are all Phanerozoic samples that had missing max and min ages in the
middle of their assigned age uncertainty?
Generate summary of median of age uncertainty minus interpreted age
for newly assigned Phanerozoic samples.
summary(
rowMeans(
cbind(Mo.anox.py.rf.missing.ages$max.age[Mo.anox.py.rf.missing.ages$interpreted.age <= 541], Mo.anox.py.rf.missing.ages$min.age[Mo.anox.py.rf.missing.ages$interpreted.age <= 541])) -
Mo.anox.py.rf.missing.ages$interpreted.age[Mo.anox.py.rf.missing.ages$interpreted.age <= 541]
)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.684e-14 0.000e+00 0.000e+00 5.812e-17 0.000e+00 5.684e-14
All values are zero within the precision of R (tiny deviations are
floating point errors).
Do all Phanerozoic samples that had missing max and min ages have 10
Myr age uncertainty? Generate summary of age uncertainty for newly
assigned Phanerozoic samples.
summary(Mo.anox.py.rf.missing.ages$max.age[Mo.anox.py.rf.missing.ages$interpreted.age <= 541] - Mo.anox.py.rf.missing.ages$min.age[Mo.anox.py.rf.missing.ages$interpreted.age <= 541])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10 10 10 10 10 10
All values are 10Myrs.
Are all Neoproterozoic samples that had missing max and min ages in
the middle of their assigned age uncertainty?
Generate summary of median of age uncertainty minus interpreted age
for newly assigned Neoproterozoic samples.
summary(
rowMeans(
cbind(Mo.anox.py.rf.missing.ages$max.age[Mo.anox.py.rf.missing.ages$interpreted.age > 541], Mo.anox.py.rf.missing.ages$min.age[Mo.anox.py.rf.missing.ages$interpreted.age > 541])) -
Mo.anox.py.rf.missing.ages$interpreted.age[Mo.anox.py.rf.missing.ages$interpreted.age > 541]
)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
All values are zero.
Do all Neoproterozoic samples that had missing max and min ages have
25 Myr age uncertainty? Generate summary of age uncertainty for newly
assigned Neoproterozoic samples.
summary(Mo.anox.py.rf.missing.ages$max.age[Mo.anox.py.rf.missing.ages$interpreted.age > 541] - Mo.anox.py.rf.missing.ages$min.age[Mo.anox.py.rf.missing.ages$interpreted.age > 541])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25 25 25 25 25 25
All values are 25Myrs.
Context data
We define a function to randomly assign values for geologic context
variables used in random forest models in cases where a sample is
missing data for that variable. This function is then used in the Monte
Carlo approach detailed below to maximize the number of samples used and
evaluate uncertainty associated with missing data by repeating the
random assignments and statistical learning models n times per analysis
(n = 100 times in this study).
partial.context <- function(data,
site.type,
metamorphic.bin,
basin.type,
site.latitude,
site.longitude,
environmental.bin,
lithology.name){
# Note that this function currently only randomly generates data from factor levels within the dataset "data" when generating data for categorical variables. Could hard code although all levels seem to be present in most if not all datasets.
# Each context variable must be set to "TRUE" (e.g. site.type = TRUE) inside function in order to be randomly assigned in the function partial.context().
# levels check - make sure that numerical identification of empty factor name (i.e. missing data) is correct (just in case of changes in SGP data download structure)
levels.check.total <- 0 # total number of context variables levels are checked for
levels.check.true <- 0 # number of levels that missing name is correctly identified
if(site.type == TRUE){
# levels check (see above)
levels.check.total <- levels.check.total +1
if(levels(data$site.type)[1] == ""){
levels.check.true <- levels.check.true + 1
}
# random assignment
data$site.type[data$site.type == "cuttings"] <- "core" ## Combine core and cuttings
data$site.type <- factor(data$site.type) ## omit unused factor levels
data$site.type[data$site.type == ""] <- sample(x = levels(data$site.type)[2:nlevels(data$site.type)], size = nrow(filter(data, site.type == "")), replace=TRUE) ## assign all empty cells one of "core" or "outcrop". Starting at level 2 omits cells with no data from vector to sample.
data$site.type <- factor(data$site.type) ## omit unused factor levels
}
if(metamorphic.bin == TRUE){
# levels check (see above)
levels.check.total <- levels.check.total +1
if(levels(data$metamorphic.bin)[1] == ""){
levels.check.true <- levels.check.true + 1
}
# random assignment
data$metamorphic.bin[data$metamorphic.bin == ""] <- sample(x = levels(data$metamorphic.bin)[2:nlevels(data$metamorphic.bin)], size = nrow(filter(data, metamorphic.bin == "")), replace=TRUE) ## assign all empty cells one of "Anchizone", "Diagenetic zone" or "Epizone". Starting at level 2 omits cells with no data from vector to sample.
data$metamorphic.bin <- factor(data$metamorphic.bin) ## omit unused factor levels
}
if(basin.type == TRUE){
# levels check (see above)
levels.check.total <- levels.check.total +1
if(levels(data$basin.type)[1] == ""){
levels.check.true <- levels.check.true + 1
}
# random assignment
data$basin.type[data$basin.type == ""] <- sample(x = levels(data$basin.type)[2:nlevels(data$basin.type)], size = nrow(filter(data, basin.type == "")), replace=TRUE) ## assign all empty cells one of the other basin types in the dataset. Starting at level 2 omits cells with no data from vector to sample.
data$basin.type <- factor(data$basin.type) ## omit unused factor levels
}
if(site.latitude == TRUE){
data$site.latitude[is.na(data$site.latitude)] <- runif(nrow(filter(data, is.na(site.latitude))), -90, 90) ## assign all empty cells a random latitude from a uniform distribution between -90 and 90 degrees.
}
if(site.longitude == TRUE){
data$site.longitude[is.na(data$site.longitude)] <- runif(nrow(filter(data, is.na(site.longitude))), -180, 180) ## assign all empty cells a random longitude from a uniform distribution between -180 and 180 degrees.
}
if(environmental.bin == TRUE){
# levels check (see above)
levels.check.total <- levels.check.total +1
if(levels(data$environmental.bin)[1] == ""){
levels.check.true <- levels.check.true + 1
}
# random assignment
data$environmental.bin[data$environmental.bin == ""] <- sample(x = levels(data$environmental.bin)[2:nlevels(data$environmental.bin)], size = nrow(filter(data, environmental.bin == "")), replace=TRUE) ## assign all empty cells one of the other environmental bins in the dataset. Starting at level 2 omits cells with no data from vector to sample.
data$environmental.bin <- factor(data$environmental.bin) ## omit unused factor levels
}
if(lithology.name == TRUE){
# levels check (see above)
levels.check.total <- levels.check.total +1
if(levels(data$lithology.name)[1] == ""){
levels.check.true <- levels.check.true + 1
}
# random assignment
data$lithology.name[data$lithology.name == ""] <- sample(x = levels(data$lithology.name)[2:nlevels(data$lithology.name)], size = nrow(filter(data, lithology.name == "")), replace=TRUE) ## assign all empty cells one of the other lithologies in the dataset. Starting at level 2 omits cells with no data from vector to sample.
data$lithology.name <- factor(data$lithology.name) ## omit unused factor levels
}
# Check that the level with no factor name has been correctly identified (this should always be true but is worth checking, especially if applied to different data). Round is used to accommodate numbers very close to 1 due to possible floating point errors (rounded to 3 decimal places).
if(round((levels.check.true/levels.check.total), digits = 3) == 1){
print("Partial context randomly assigned correctly - missing data identified correctly")
}else{
print("ERROR - missing data NOT identified correctly")
}
return(data)
}
Test function on subdataset for primary Mo analyses.
Summary before partial.context function.
summary(Mo.anox.py.rf)
## sample.identifier sample.original.name height.depth section.name
## Min. : 9 Length:2355 Min. : -76.00 Length:2355
## 1st Qu.: 3038 Class :character 1st Qu.: 20.95 Class :character
## Median : 6795 Mode :character Median : 80.82 Mode :character
## Mean :19037 Mean : 554.01
## 3rd Qu.:13388 3rd Qu.: 252.40
## Max. :95921 Max. :10359.00
## NA's :92
## site.type site.latitude site.longitude basin.type
## : 0 Min. :-33.00 Min. :-140.94 rift :1318
## core : 725 1st Qu.: 28.49 1st Qu.:-135.62 passive margin : 345
## cuttings: 48 Median : 54.07 Median : -74.56 : 193
## outcrop :1582 Mean : 46.43 Mean : -25.64 intracratonic sag : 167
## 3rd Qu.: 65.87 3rd Qu.: 103.06 retro-arc foreland : 143
## Max. : 79.97 Max. : 139.00 peripheral foreland: 114
## NA's :33 NA's :33 (Other) : 75
## metamorphic.bin interpreted.age max.age min.age
## : 226 Min. :301.0 Min. :329.8 Min. :329.0
## Anchizone :1561 1st Qu.:425.6 1st Qu.:410.8 1st Qu.:404.0
## Diagenetic zone: 518 Median :503.0 Median :433.4 Median :430.5
## Epizone : 50 Mean :502.5 Mean :441.3 Mean :435.4
## 3rd Qu.:551.0 3rd Qu.:481.7 3rd Qu.:480.0
## Max. :830.0 Max. :850.0 Max. :811.5
## NA's :1605 NA's :1608
## long.stratigraphy.name stratigraphy.name environmental.bin
## Length:2355 Length:2355 : 20
## Class :character Class :character basinal :1730
## Mode :character Mode :character inner shelf: 179
## outer shelf: 426
##
##
##
## lithology.name Al..wt.. Fe..wt.. Mo..ppm.
## shale :1743 Min. : 0.090 Min. : 0.050 Min. : 0.000
## mudstone : 337 1st Qu.: 3.700 1st Qu.: 1.440 1st Qu.: 2.000
## siltstone : 139 Median : 5.500 Median : 2.420 Median : 6.537
## : 94 Mean : 5.742 Mean : 2.749 Mean : 28.400
## lime mudstone: 36 3rd Qu.: 7.690 3rd Qu.: 3.770 3rd Qu.: 29.607
## phosphorite : 6 Max. :22.790 Max. :38.500 Max. :499.000
## (Other) : 0 NA's :260 NA's :37
## U..ppm. FeHR.FeT Fe.py.FeHR FeT.Al
## Min. : 0.02 Min. :0.3800 Min. :0.0000 Min. : 0.0400
## 1st Qu.: 3.00 1st Qu.:0.5200 1st Qu.:0.2000 1st Qu.: 0.3400
## Median : 5.64 Median :0.6800 Median :0.5500 Median : 0.4700
## Mean : 11.51 Mean :0.6929 Mean :0.4881 Mean : 0.5732
## 3rd Qu.: 13.00 3rd Qu.:0.8300 3rd Qu.:0.7500 3rd Qu.: 0.6300
## Max. :295.29 Max. :6.5000 Max. :1.0000 Max. :34.0700
## NA's :426 NA's :254
## TOC..wt..
## Min. : 0.000
## 1st Qu.: 0.800
## Median : 2.020
## Mean : 2.979
## 3rd Qu.: 3.855
## Max. :31.280
##
Run function, then check summary after partial.context function.
Mo.anox.py.rf.partial.test <- partial.context(data = Mo.anox.py.rf,
site.type = TRUE,
metamorphic.bin = TRUE,
basin.type = TRUE,
site.latitude = TRUE,
site.longitude = TRUE,
environmental.bin = TRUE,
lithology.name = TRUE)
## [1] "Partial context randomly assigned correctly - missing data identified correctly"
summary(Mo.anox.py.rf.partial.test)
## sample.identifier sample.original.name height.depth section.name
## Min. : 9 Length:2355 Min. : -76.00 Length:2355
## 1st Qu.: 3038 Class :character 1st Qu.: 20.95 Class :character
## Median : 6795 Mode :character Median : 80.82 Mode :character
## Mean :19037 Mean : 554.01
## 3rd Qu.:13388 3rd Qu.: 252.40
## Max. :95921 Max. :10359.00
## NA's :92
## site.type site.latitude site.longitude basin.type
## core : 773 Min. :-83.55 Min. :-179.39 rift :1338
## outcrop:1582 1st Qu.: 28.29 1st Qu.:-135.62 passive margin : 371
## Median : 53.97 Median : -74.56 intracratonic sag : 194
## Mean : 45.85 Mean : -25.63 retro-arc foreland : 166
## 3rd Qu.: 65.87 3rd Qu.: 103.06 peripheral foreland: 141
## Max. : 89.10 Max. : 177.31 back-arc : 90
## (Other) : 55
## metamorphic.bin interpreted.age max.age min.age
## Anchizone :1629 Min. :301.0 Min. :329.8 Min. :329.0
## Diagenetic zone: 593 1st Qu.:425.6 1st Qu.:410.8 1st Qu.:404.0
## Epizone : 133 Median :503.0 Median :433.4 Median :430.5
## Mean :502.5 Mean :441.3 Mean :435.4
## 3rd Qu.:551.0 3rd Qu.:481.7 3rd Qu.:480.0
## Max. :830.0 Max. :850.0 Max. :811.5
## NA's :1605 NA's :1608
## long.stratigraphy.name stratigraphy.name environmental.bin
## Length:2355 Length:2355 basinal :1737
## Class :character Class :character inner shelf: 184
## Mode :character Mode :character outer shelf: 434
##
##
##
##
## lithology.name Al..wt.. Fe..wt.. Mo..ppm.
## shale :1749 Min. : 0.090 Min. : 0.050 Min. : 0.000
## mudstone : 345 1st Qu.: 3.700 1st Qu.: 1.440 1st Qu.: 2.000
## siltstone : 147 Median : 5.500 Median : 2.420 Median : 6.537
## lime mudstone: 43 Mean : 5.742 Mean : 2.749 Mean : 28.400
## metasiltstone: 11 3rd Qu.: 7.690 3rd Qu.: 3.770 3rd Qu.: 29.607
## argillite : 10 Max. :22.790 Max. :38.500 Max. :499.000
## (Other) : 50 NA's :260 NA's :37
## U..ppm. FeHR.FeT Fe.py.FeHR FeT.Al
## Min. : 0.02 Min. :0.3800 Min. :0.0000 Min. : 0.0400
## 1st Qu.: 3.00 1st Qu.:0.5200 1st Qu.:0.2000 1st Qu.: 0.3400
## Median : 5.64 Median :0.6800 Median :0.5500 Median : 0.4700
## Mean : 11.51 Mean :0.6929 Mean :0.4881 Mean : 0.5732
## 3rd Qu.: 13.00 3rd Qu.:0.8300 3rd Qu.:0.7500 3rd Qu.: 0.6300
## Max. :295.29 Max. :6.5000 Max. :1.0000 Max. :34.0700
## NA's :426 NA's :254
## TOC..wt..
## Min. : 0.000
## 1st Qu.: 0.800
## Median : 2.020
## Mean : 2.979
## 3rd Qu.: 3.855
## Max. :31.280
##
Check that different random datasets are being generated each
time.
Generate another test dataset and check they are different.
Mo.anox.py.rf.partial.test.2 <- partial.context(data = Mo.anox.py.rf,
site.type = TRUE,
metamorphic.bin = TRUE,
basin.type = TRUE,
site.latitude = TRUE,
site.longitude = TRUE,
environmental.bin = TRUE,
lithology.name = TRUE)
## [1] "Partial context randomly assigned correctly - missing data identified correctly"
For site type
summary(Mo.anox.py.rf.partial.test$site.type)
## core outcrop
## 773 1582
summary(Mo.anox.py.rf.partial.test.2$site.type)
## core outcrop
## 773 1582
For metamorphic bin:
summary(Mo.anox.py.rf.partial.test$metamorphic.bin)
## Anchizone Diagenetic zone Epizone
## 1629 593 133
summary(Mo.anox.py.rf.partial.test.2$metamorphic.bin)
## Anchizone Diagenetic zone Epizone
## 1640 600 115
For basin type:
summary(Mo.anox.py.rf.partial.test$basin.type)
## back-arc fore-arc intracratonic sag passive margin
## 90 30 194 371
## peripheral foreland retro-arc foreland rift wrench
## 141 166 1338 25
summary(Mo.anox.py.rf.partial.test.2$basin.type)
## back-arc fore-arc intracratonic sag passive margin
## 93 29 193 369
## peripheral foreland retro-arc foreland rift wrench
## 144 166 1336 25
For latitude:
summary(Mo.anox.py.rf.partial.test$site.latitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -83.55 28.29 53.97 45.85 65.87 89.10
summary(Mo.anox.py.rf.partial.test.2$site.latitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -88.74 28.29 54.07 45.88 65.87 88.67
For longitude:
summary(Mo.anox.py.rf.partial.test$site.longitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -179.39 -135.62 -74.56 -25.63 103.06 177.31
summary(Mo.anox.py.rf.partial.test.2$site.longitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -175.92 -135.62 -74.56 -25.28 103.06 172.99
For environmental bin:
summary(Mo.anox.py.rf.partial.test$environmental.bin)
## basinal inner shelf outer shelf
## 1737 184 434
summary(Mo.anox.py.rf.partial.test.2$environmental.bin)
## basinal inner shelf outer shelf
## 1737 184 434
For lithology:
summary(Mo.anox.py.rf.partial.test$lithology.name)
## argillite claystone dolomudstone lime mudstone metasiltstone
## 10 7 10 43 11
## mudstone oil shale pelite phosphorite shale
## 345 1 10 10 1749
## silt siltstone slate
## 8 147 4
summary(Mo.anox.py.rf.partial.test.2$lithology.name)
## argillite claystone dolomudstone lime mudstone metasiltstone
## 8 4 9 41 7
## mudstone oil shale pelite phosphorite shale
## 345 5 7 15 1752
## silt siltstone slate
## 7 147 8
Al imputation
We define a function to estimate the lithology of samples with no Al
concentration based upon lithology.
First, we make and save a lookup table including all samples in the
main dataset, summarizing Al concentrations by lithology by calculating
the median and the 25th and 75th percentiles (the bounds of the
interquartile range). We make extra single-row table for all samples
with identified lithologies.
Al.lookup <- trace.toc.full %>%
filter(!(lithology.name == "") & !(is.na(Al..wt..))) %>%
group_by(lithology.name) %>%
summarize(median = median(Al..wt.., na.rm = TRUE), perc.25 = quantile(Al..wt.., prob = 0.25, na.rm = T), perc.75 = quantile(Al..wt.., prob = 0.75, na.rm = T))
# make extra single-row table for all samples with identified lithologies
Al.lookup.all <- trace.toc.full %>%
filter(!(lithology.name == "") & !(is.na(Al..wt..))) %>%
summarize(lithology.name = "all identified samples", median = median(Al..wt.., na.rm = TRUE), perc.25 = quantile(Al..wt.., prob = 0.25, na.rm = T), perc.75 = quantile(Al..wt.., prob = 0.75, na.rm = T))
save(Al.lookup, Al.lookup.all, file ="Al.lookup_trace.toc.full.RData")
We then define the imputation function. For each sample with
identified lithology but missing [Al] data we randomly assign that
sample an [Al] value from a uniform distribution with the 25th and 75th
percentile [Al] concentrations for that lithology as the minimum and
maximum bounds. For samples with no identified lithology that are
missing [Al] data, we randomly assign that sample an [Al] value from a
uniform distribution with the 25th and 75th percentile [Al]
concentrations for all samples as the minimum and maximum bounds. Note
that in this dataset there is no Al data for “silt” samples - so we use
data for “siltstone” for “silt” (note this only impacts one sample, from
the Ediacaran Doushantuo Formation, that is only used in TOC
analyses).
Al.impute <- function(data){
load("Al.lookup_trace.toc.full.RData")
for(row in 1:nrow(data)){
if(is.na(data$Al..wt..[row]) == TRUE){
# If lithology is unidentified, sample from full distribution of identified samples
if(data$lithology.name[row] == ""){
data$Al..wt..[row] <- runif(1, Al.lookup.all$perc.25[Al.lookup.all$lithology.name == "all identified samples"], Al.lookup.all$perc.75[Al.lookup.all$lithology.name == "all identified samples"])
}else if(data$lithology.name[row] == "silt"){
# There is no Al data for "silt" - use data for "siltstone" for "silt" (note this only impacts one sample, from the Ediacaran Doushantuo Formation, that is only used in TOC analyses)
data$Al..wt..[row] <- runif(1, Al.lookup$perc.25[Al.lookup$lithology.name == "siltstone"], Al.lookup$perc.75[Al.lookup$lithology.name == "siltstone"])
}else {
# Otherwise (except in the case of the two exceptions above), sample from the lithology-specific Al distribution.
data$Al..wt..[row] <- runif(1, Al.lookup$perc.25[Al.lookup$lithology.name == data$lithology.name[row]], Al.lookup$perc.75[Al.lookup$lithology.name == data$lithology.name[row]])
}
}
}
return(data)
}
Test the Al imputation method. Are all of the imputed Al values
within the interquartile ranges established for their given lithology?
Use primary Mo dataset as an example.
# tag samples missing Al with identified lithologies
Mo.anox.py.rf.no.Al.test <- Mo.anox.py.rf
Mo.anox.py.rf.no.Al.test$missing.Al[is.na(Mo.anox.py.rf.no.Al.test$Al..wt..) == TRUE] <- TRUE
Mo.anox.py.rf.no.Al.test$missing.Al[is.na(Mo.anox.py.rf.no.Al.test$Al..wt..) == FALSE] <- FALSE
Mo.anox.py.rf.no.Al.test$missing.lith[Mo.anox.py.rf.no.Al.test$lithology.name == ""] <- TRUE
Mo.anox.py.rf.no.Al.test$missing.lith[Mo.anox.py.rf.no.Al.test$lithology.name != ""] <- FALSE
Mo.anox.py.rf.imputed.Al <- Al.impute(data = Mo.anox.py.rf.no.Al.test)
Mo.anox.py.rf.imputed.Al.no.Al.before <- filter(Mo.anox.py.rf.imputed.Al, (missing.Al == TRUE & missing.lith == FALSE))
Mo.anox.py.rf.imputed.Al.no.Al.before <- merge(Mo.anox.py.rf.imputed.Al.no.Al.before, Al.lookup, id = c(lithology.name), all.x = TRUE)
for(row in 1:nrow(Mo.anox.py.rf.imputed.Al.no.Al.before)){
Mo.anox.py.rf.imputed.Al.no.Al.before$test[row] <- between(Mo.anox.py.rf.imputed.Al.no.Al.before$Al..wt..[row], Mo.anox.py.rf.imputed.Al.no.Al.before$perc.25[row], Mo.anox.py.rf.imputed.Al.no.Al.before$perc.75[row])
}
summary(Mo.anox.py.rf.imputed.Al.no.Al.before$test)
## Mode TRUE
## logical 257
It is true for all samples in this test dataset that all of the
imputed [Al] values within the interquartile ranges established for
their given lithology.
3. Age model
We define a function to randomly assign an age to each sample from
within its prescribed age uncertainty. The age is assigned from a
uniform distribution bounded by the minimum and maximum ages. This
function is then used in the Monte Carlo approach detailed below to
evaluate uncertainty associated with geologic ages by repeating the
random assignments and statistical learning models n times per analysis
(n = 100 in the analyses of this study). In a few cases (82 samples in
trace.toc.full), the sample minimum and maximum assigned ages were
flipped by SGP contributors during data entry. We correct for samples
like this in the function. In even fewer cases (6 in trace.toc.full,
from the same two Cambrian formations as the flipped samples above)
there are samples where contributors have given samples the same max,
min and interpreted age. Those samples are assigned their interpreted
age.
age.model.basic <- function(data){
# Default method (vast majority of samples)
data$age.model[data$max.age > data$min.age] <- runif(n = nrow(filter(data, max.age > min.age)), min = data$min.age[data$max.age > data$min.age], max = data$max.age[data$max.age > data$min.age])
# Samples with flipped max and min (note that max and min are flipped in runif())
data$age.model[data$min.age > data$max.age] <- runif(n = nrow(filter(data, min.age > max.age)), min = data$max.age[data$min.age > data$max.age], max = data$min.age[data$min.age > data$max.age])
# Samples with completed age uncertainty but max, min and interpreted age the same
data$age.model[data$min.age == data$max.age] <- data$interpreted.age[data$min.age == data$max.age]
return(data)
}
Test age model function by confirming that all ages in age model are
within the age uncertainty for each sample, using the primary Mo
subdataset as an example.
Mo.anox.py.rf.age.model <- Mo.anox.py.rf
Mo.anox.py.rf.age.model <- age.unc(data = Mo.anox.py.rf.age.model)
Mo.anox.py.rf.age.model <- age.model.basic(data = Mo.anox.py.rf.age.model)
within.age.bounds <- as.character()
for(row in 1:nrow(Mo.anox.py.rf.age.model)){
within.age.bounds[row] <- between(Mo.anox.py.rf.age.model$age.model[row], Mo.anox.py.rf.age.model$min.age[row], Mo.anox.py.rf.age.model$max.age[row])
}
print("For how many samples is it true that the age model is within the age uncertainty bounds?")
## [1] "For how many samples is it true that the age model is within the age uncertainty bounds?"
summary(as.factor(within.age.bounds))
## TRUE
## 2355
Secondly, consider a different approach to age modeling that forces
all samples to be in the correct stratigraphic order for each
stratigraphic unit.
age.model <- function(data, iteration, dataset = "general-test", plot.strat = TRUE, run.w.rf, stepped.threshold = 2){
### Note that we have 6 options (under 3 main headers) for our age models.
# Option 1a: Heights and depths for all samples + age structure
# Option 1b: Heights and depths for all samples + no age structure
# Option 1c: Heights and depths for all samples + stepped age structure
# Option 2a: No measured section + age structure
# Option 2b: No measured section + no age structure
# Option 2c: No measured section + stepped age structure
# Option 3a: Single sample section + measured section
# Option 3b: Single sample section + no measured section
# summarise all sections to create list
sections.sum <- data %>%
group_by(section.name) %>%
tally()
if(run.w.rf == FALSE){
# set core/cuttings depth to -ve so can use same script for outcrop and cuttings
data$height.depth[data$site.type == "core" | data$site.type == "cuttings"] <- -data$height.depth[data$site.type == "core" | data$site.type == "cuttings"]
# Fix switched Cambrian ages.
# If relationship between min and max age is as expected (max age greater than min age), keep columns the same.
# If relationship is wrong way around, switch them
data <- transform(data, max.age = ifelse(max.age >= min.age, max.age, min.age), min.age = ifelse(max.age >= min.age, min.age, max.age))
}
for (row in 1:nrow(sections.sum)) {
section <- sections.sum$section.name[row]
#print(section) # print section name for troubleshooting if useful
# generate subset of the dataframe for specific section
section.data <- filter(data, section.name == section)
####################
#### SCENARIO 1 #### [and 3a]
####################
# If section has section height/depth (i.e. no NAs for section height/depth):
if(is.na(mean(section.data$height.depth, na.rm = FALSE)) == FALSE){
#####################
#### SCENARIO 3a ####
#####################
# If there is height/depth data and only one sample...
if(nrow(section.data) <2){
# Basic age model method
data$age.model[data$section.name == section] <- runif(n = nrow(filter(data, section.name == section)), min = data$min.age[data$section.name == section], max = data$max.age[data$section.name == section])
# Otherwise...
}else{
#####################
#### SCENARIO 1b ####
#####################
# If there are heights/depths entered but the ages are all the same:
# apply approach for no obvious age structure (range is less than or equal to 1e-6, used as an approximation of zero to avoid R issues with zeros)
if(diff(range(section.data$interpreted.age)) <= 1e-6){
# If there are heights but all of the interpreted ages are the same, assign uniform age model between randomly assigned minimum and maximum age with principle of superposition
min.height_depth.sample <- section.data[which.min(section.data$height.depth),]
max.height_depth.sample <- section.data[which.max(section.data$height.depth),]
min.section.age <- runif(1, min = max.height_depth.sample$min.age, max = max.height_depth.sample$max.age)
if(min.height_depth.sample$min.age > min.section.age){
max.section.age <- runif(1, min = min.height_depth.sample$min.age, max = min.height_depth.sample$max.age)
}else{
max.section.age <- runif(1, min = min.section.age, max = min.height_depth.sample$max.age)
}
modeled.section.duration <- max.section.age - min.section.age
min.section.height.depth <- min.height_depth.sample$height.depth
total.section.height.depth <- max.height_depth.sample$height.depth - min.height_depth.sample$height.depth
data$min.section.age[data$section.name == section] <- min.section.age
data$max.section.age[data$section.name == section] <- max.section.age
data$min.section.height.depth[data$section.name == section] <- min.section.height.depth
data$total.section.height.depth[data$section.name == section] <- total.section.height.depth
data$modeled.section.duration[data$section.name == section] <- modeled.section.duration
data$age.model[data$section.name == section] <- data$max.section.age[data$section.name == section] - ((data$height.depth[data$section.name == section] - data$min.section.height.depth[data$section.name == section])/data$total.section.height.depth[data$section.name == section])*data$modeled.section.duration[data$section.name == section]
#print("Troubleshooting: Option 1b")
# End of actively editing loop... 20230313 14.46
#### SCENARIOS 1a+c ####
}else{
# if interpreted section age ascends with height/depth, flip section
sign.heights.depths <- mean(sign(diff(section.data$height.depth)), na.rm =TRUE)
sign.interpreted.ages <- mean(sign(diff(section.data$interpreted.age)), na.rm =TRUE)
# We anticipate ages to decrease as height increases, therefore signs (-1 or 1) of heights/depths and intepreted ages should be opposite
# therefore the product of the two signs should always be negative (-1)
# if they are not, then make heights/depths negative. This may not be an accurate representation of the geological section but that should not matter, this is simply for the age model calculations which will now proceed in the correct direction.
# note that we are testing for the majority of the section younging up here.
if(sign.heights.depths*sign.interpreted.ages > -0.999){
# check flipping will work and the section isn't just a total mess. If it will work (and flipping will make sign very nearly 1), flip
if(sign.heights.depths*sign.interpreted.ages > +0.999){
section.data$height.depth <- -section.data$height.depth
# also change in dataframe for plotting
data$height.depth[data$section.name == section] <- -data$height.depth[data$section.name == section]
}
}
# now, if ascending trend is still a total mess, do random sample between age uncertainty bounds. Need to check signs agan to do this.
sign.heights.depths <- mean(sign(diff(section.data$height.depth)), na.rm =TRUE)
sign.interpreted.ages <- mean(sign(diff(section.data$interpreted.age)), na.rm =TRUE)
if(sign.heights.depths*sign.interpreted.ages > -0.999){
data$age.model[data$section.name == section] <- runif(n = nrow(filter(data, section.name == section)), min = data$min.age[data$section.name == section], max = data$max.age[data$section.name == section])
}else{
#######
# check if interpretted age model is unnaturally stepped
# (i.e. multiple groups with all the same age - similar to scenario above)
age.group.sum <- section.data %>%
group_by(interpreted.age) %>%
tally()
# are all samples in this section listed in interpreted age chunks?
# RGS note - return to stepped age criteria?
#####################
#### SCENARIO 1c ####
#####################
if(nrow(age.group.sum) <= (nrow(section.data)/stepped.threshold)){
# compute age groups - do in reverse order so still go in age order up section
age.groups <- age.group.sum$interpreted.age[rev(order(age.group.sum$interpreted.age))]
# if(mean(age.group.sum$n) > 1){ removed for code development
for(age.group.no in 1:length(age.group.sum$n)){
#print(age.group.no)
age.group.interpreted.age <- as.numeric(age.groups[age.group.no])
if(age.group.no == 1){
age.group.data <- filter(section.data, interpreted.age == age.group.interpreted.age)
if(nrow(age.group.data) < 2){
# Basic age model method
data$age.model[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <-
runif(1, min = data$min.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age], max = data$max.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age])
}else{
min.height_depth.sample <- age.group.data[which.min(age.group.data$height.depth),]
max.height_depth.sample <- age.group.data[which.max(age.group.data$height.depth),]
# note that keep "section" in names rather than "age group" right now
min.section.age <- runif(1, min = max.height_depth.sample$min.age, max = max.height_depth.sample$max.age)
if(min.height_depth.sample$min.age > min.section.age){
max.section.age <- runif(1, min = min.height_depth.sample$min.age, max = min.height_depth.sample$max.age)
}else{
max.section.age <- runif(1, min = min.section.age, max = min.height_depth.sample$max.age)
}
modeled.section.duration <- max.section.age - min.section.age
min.section.height.depth <- min.height_depth.sample$height.depth
total.section.height.depth <- max.height_depth.sample$height.depth - min.height_depth.sample$height.depth
data$min.section.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- min.section.age
data$max.section.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- max.section.age
data$min.section.height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- min.section.height.depth
data$total.section.height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- total.section.height.depth
data$modeled.section.duration[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- modeled.section.duration
data$age.model[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- data$max.section.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age] - ((data$height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age] - data$min.section.height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age])/data$total.section.height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age])*data$modeled.section.duration[data$section.name == section & data$interpreted.age == age.group.interpreted.age]
}
}else{
age.group.below <- as.numeric(age.groups[age.group.no-1])
min.model.age.age.group.below <- min(data$age.model[data$section.name == section & data$interpreted.age == age.group.below])
age.group.data <- filter(section.data, interpreted.age == age.group.interpreted.age)
if(nrow(age.group.data) < 2){
# if only one sample in step then randomly select age
# Basic age model method - but checking min max age
data$age.model[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <-
runif(1, min = data$min.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age], max = min(c(min.model.age.age.group.below, data$max.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age])))
}else{
min.height_depth.sample <- age.group.data[which.min(age.group.data$height.depth),]
max.height_depth.sample <- age.group.data[which.max(age.group.data$height.depth),]
max.section.age <- runif(1, min = min.height_depth.sample$min.age, max = min(c(min.model.age.age.group.below, min.height_depth.sample$max.age)))
# note that keep "section" in names rather than "age group" right now
min.section.age <- runif(1, min = max.height_depth.sample$min.age, max = max.section.age)
# no need for this if loop used in non-chunked age models as it is always the case for age groups that the min.section.age will be younger than the minimum age of the sample at the stratigraphic top of the chunk (?) double check!
# if(min.height_depth.sample$min.age > min.section.age){
# max.section.age <- runif(1, min = min.height_depth.sample$min.age, max = min.height_depth.sample$max.age)
# }else{
# max.section.age <- runif(1, min = min.section.age, max = min.height_depth.sample$max.age)
# }
modeled.section.duration <- max.section.age - min.section.age
min.section.height.depth <- min.height_depth.sample$height.depth
total.section.height.depth <- max.height_depth.sample$height.depth - min.height_depth.sample$height.depth
data$min.section.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- min.section.age
data$max.section.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- max.section.age
data$min.section.height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- min.section.height.depth
data$total.section.height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- total.section.height.depth
data$modeled.section.duration[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- modeled.section.duration
data$age.model[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <- data$max.section.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age] - ((data$height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age] - data$min.section.height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age])/data$total.section.height.depth[data$section.name == section & data$interpreted.age == age.group.interpreted.age])*data$modeled.section.duration[data$section.name == section & data$interpreted.age == age.group.interpreted.age]
}
}
#print("Troubleshooting: Option 1c")
}
}else{
#####################
#### SCENARIO 1a ####
#####################
min.height_depth.sample <- section.data[which.min(section.data$height.depth),]
max.height_depth.sample <- section.data[which.max(section.data$height.depth),]
# assume section has no unconformities (?)
min.section.interpreted.age <- max.height_depth.sample$interpreted.age
max.section.interpreted.age <- min.height_depth.sample$interpreted.age
interpreted.section.duration <- max.section.interpreted.age - min.section.interpreted.age
### THIS NOW SHOULDNT BE NECESSARY??? 20230802
# If section has been entered wrong and heights/depths do not decrease with age
# in this scenario min.height_depth.sample$interpreted.age <= max.height_depth.sample$interpreted.age
# default to just random ages.
# NOTE - alternatively could assume height numbers should be inverted as depths.
if(min.height_depth.sample$interpreted.age <= max.height_depth.sample$interpreted.age){
# Default method
data$age.model[data$section.name == section] <- runif(n = nrow(filter(data, section.name == section)), min = data$min.age[data$section.name == section], max = data$max.age[data$section.name == section])
# Alternative would remove the "else" statement below and set height/depth values to be negative
}else{
min.section.age <- runif(1, min = max.height_depth.sample$min.age, max = max.height_depth.sample$max.age)
# a couple of options...
# 1 - go with interpreted section duration relative to min age
# 2 - go with random min and max. If min age of max is lower than random age of min, use random age of min as min age of max
### OPTION 1 ###
#max.section.age <- min.section.age + abs(min.height_depth.sample$interpreted.age - max.height_depth.sample$interpreted.age)
### OPTION 2 ###
if(min.height_depth.sample$min.age > min.section.age){
max.section.age <- runif(1, min = min.height_depth.sample$min.age, max = min.height_depth.sample$max.age)
}else{
max.section.age <- runif(1, min = min.section.age, max = min.height_depth.sample$max.age)
}
modeled.section.duration <- max.section.age - min.section.age
data$min.section.age[data$section.name == section] <- min.section.age
data$max.section.age[data$section.name == section] <- max.section.age
data$modeled.section.duration[data$section.name == section] <- modeled.section.duration
data$interpreted.section.duration[data$section.name == section] <- interpreted.section.duration
data$min.section.interpreted.age[data$section.name == section] <- min.section.interpreted.age
#data$min.section.height[data$section.name == section] <- min.section.height
#data$max.section.height[data$section.name == section] <- max.section.height
# Used abs() here but now cores are negative should always be positive?
#data$measured.section.height[data$section.name == section] <- abs(max.section.height - min.section.height)
data$age.model[data$section.name == section] <- data$min.section.age[data$section.name == section] + ((data$interpreted.age[data$section.name == section] - data$min.section.interpreted.age[data$section.name == section])/data$interpreted.section.duration[data$section.name == section])*data$modeled.section.duration[data$section.name == section]
#print("Troubleshooting: Option 1a")
}
}
}
}
}
# make plots for sections with height/depth data
if(plot.strat == TRUE){
section.data.for.plot <- filter(data, section.name == section)
section <- sub("/", "-", section)
section <- sub(":", "-", section)
plot <- ggplot()+
geom_errorbarh(data = section.data.for.plot, aes(y = height.depth, xmin = min.age, xmax=max.age), colour = 'darkred', height = .5)+
geom_point(data = section.data.for.plot, aes(y = height.depth, x = age.model), shape=21, fill="grey70", size=6, alpha=0.8)+
theme_bw()+
scale_x_reverse()+
ylab(expression("Height/Depth"))+xlab(expression("Age Model (Ma)"))+
theme(plot.margin = ggplot2::margin(1,1,1,1,"cm"),panel.border = element_rect(fill=NA,color="black", linewidth=2,linetype="solid"),
axis.line = element_line(lineend = 'square'),
axis.ticks = element_line(linewidth=1),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=24),
legend.text = element_text( size=18),
axis.ticks.length = unit(5, "points"),
legend.justification=c(1,1), legend.position=c(.98,.36),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
dataset.file.name <- sub("..ppm.|..wt..", "", dataset)
mainDir <- getwd()
subDir <- paste("strat.model.plots/", dataset.file.name, sep = "")
subsubDir <- paste("/iteration-", iteration, sep = "")
dir.create(file.path(mainDir, subDir), showWarnings = FALSE)
dir.create(file.path(mainDir, subDir, subsubDir), showWarnings = FALSE)
setwd(file.path(mainDir, subDir, subsubDir))
ggsave(paste0(section ,".pdf"), plot, width=9.3, height=8.5, units="in")
setwd(file.path(mainDir))
}
}
####################
#### SCENARIO 2 #### [and 3b]
####################
# If section does not have section height/depth (i.e. at least some NAs for section height/depth):
if(is.na(mean(section.data$height.depth, na.rm = FALSE)) == TRUE){
#####################
#### SCENARIO 3b ####
#####################
# If there is only one sample...
if(nrow(section.data) <2){
# Basic age model method
data$age.model[data$section.name == section] <- runif(n = nrow(filter(data, section.name == section)), min = data$min.age[data$section.name == section], max = data$max.age[data$section.name == section])
#print("Troubleshooting: Option 3b")
# Otherwise...
}else{
#### SCENARIO 2 #### [and 3b]
if(diff(range(section.data$interpreted.age)) <= 1e-6){
#####################
#### SCENARIO 2b ####
#####################
# no obvious age structure (range is less than approx zero, 1e-6 used to avoid R issues with zeros)
# Default age model method
data$age.model[data$section.name == section] <- runif(n = nrow(filter(data, section.name == section)), min = data$min.age[data$section.name == section], max = data$max.age[data$section.name == section])
#print("Troubleshooting: Option 2b")
} else{
# check if interpretted age model is unnaturally stepped
# (i.e. multiple groups with all the same age - similar to scenario above)
age.group.sum <- section.data %>%
group_by(interpreted.age) %>%
tally()
if(nrow(age.group.sum) <= (nrow(section.data)/stepped.threshold)){
#####################
#### SCENARIO 2c ####
#####################
# compute age groups - do in reverse order so still go in age order up section
age.groups <- age.group.sum$interpreted.age[rev(order(age.group.sum$interpreted.age))]
# if(mean(age.group.sum$n) > 1){ removed for code development
for(age.group.no in 1:length(age.group.sum$n)){
#print(age.group.no)
age.group.interpreted.age <- as.numeric(age.groups[age.group.no])
if(age.group.no == 1){
age.group.data <- filter(section.data, interpreted.age == age.group.interpreted.age)
# Basic age model method
data$age.model[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <-
runif(nrow(age.group.data), min = data$min.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age], max = data$max.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age])
}else{
age.group.below <- as.numeric(age.groups[age.group.no-1])
min.model.age.age.group.below <- min(data$age.model[data$section.name == section & data$interpreted.age == age.group.below])
age.group.data <- filter(section.data, interpreted.age == age.group.interpreted.age)
# Basic age model method, but with min age from group below
data$age.model[data$section.name == section & data$interpreted.age == age.group.interpreted.age] <-
runif(nrow(age.group.data), min = data$min.age[data$section.name == section & data$interpreted.age == age.group.interpreted.age], max = min.model.age.age.group.below)
}
}
#print(section)
}else{
#####################
#### SCENARIO 2a ####
#####################
min.age.sample <- section.data[which.min(section.data$interpreted.age),]
max.age.sample <- section.data[which.max(section.data$interpreted.age),]
min.section.interpreted.age <- min.age.sample$interpreted.age
max.section.interpreted.age <- max.age.sample$interpreted.age
interpreted.section.duration <- max.section.interpreted.age - min.section.interpreted.age
min.section.age <- runif(1, min = min.age.sample$min.age, max = min.age.sample$max.age)
# a couple of options...
# 1 - go with interpreted section duration relative to min age
# 2 - go with random min and max. If min age of max is lower than random age of min, use random age of min as min age of max
### OPTION 1 ###
#max.section.age <- min.section.age + abs(max.age.sample$interpreted.age - min.age.sample$interpreted.age)
### OPTION 2 ###
if(max.age.sample$min.age > min.section.age){
max.section.age <- runif(1, min = max.age.sample$min.age, max = max.age.sample$max.age)
}else{
max.section.age <- runif(1, min = min.section.age, max = max.age.sample$max.age)
}
modeled.section.duration <- max.section.age - min.section.age
data$min.section.age[data$section.name == section] <- min.section.age
data$max.section.age[data$section.name == section] <- max.section.age
# Note that in this version, modeled and interpreted ages are different. In Option 1 version of age model, modeled and interpreted ages are the same value.
data$modeled.section.duration[data$section.name == section] <- modeled.section.duration
data$interpreted.section.duration[data$section.name == section] <- interpreted.section.duration
data$min.section.interpreted.age[data$section.name == section] <- min.section.interpreted.age
data$age.model[data$section.name == section] <- data$min.section.age[data$section.name == section] + ((data$interpreted.age[data$section.name == section] - data$min.section.interpreted.age[data$section.name == section])/data$interpreted.section.duration[data$section.name == section])*data$modeled.section.duration[data$section.name == section]
#print("Troubleshooting: Option 2a")
}
}
}
if(plot.strat == TRUE){
section.data.for.plot <- filter(data, section.name == section)
section <- sub("/", "-", section)
section <- sub(":", "-", section)
plot <- ggplot()+
geom_errorbarh(data = section.data.for.plot, aes(y = interpreted.age, xmin = min.age, xmax=max.age), colour = 'darkred', height = .5)+
geom_point(data = section.data.for.plot, aes(y = interpreted.age, x = age.model), shape=21, fill="grey70", size=6, alpha=0.8)+
theme_bw()+
scale_y_reverse()+scale_x_reverse()+
ylab(expression("Interpreted Age (Ma)"))+xlab(expression("Age Model (Ma)"))+
theme(plot.margin = ggplot2::margin(1,1,1,1,"cm"),panel.border = element_rect(fill=NA,color="black", linewidth=2,linetype="solid"),
axis.line = element_line(lineend = 'square'),
axis.ticks = element_line(linewidth=1),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=24),
legend.text = element_text( size=18),
axis.ticks.length = unit(5, "points"),
legend.justification=c(1,1), legend.position=c(.98,.36),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
dataset.file.name <- sub("..ppm.|..wt..", "", dataset)
mainDir <- getwd()
subDir <- paste("strat.model.plots/", dataset.file.name, sep = "")
subsubDir <- paste("/iteration-", iteration, sep = "")
dir.create(file.path(mainDir, subDir), showWarnings = FALSE)
dir.create(file.path(mainDir, subDir, subsubDir), showWarnings = FALSE)
setwd(file.path(mainDir, subDir, subsubDir))
ggsave(paste0(section ,".pdf"), plot, width=9.3, height=8.5, units="in")
setwd(file.path(mainDir))
}
}
# Going to have to then check if I have covered all options.
# Samples with completed age uncertainty but max, min and interpreted age the same
#data$age.model[data$min.age == data$max.age] <- data$interpreted.age[data$min.age == data$max.age]
}
return(data)
}
Check age model
iteration <- "test"
test.data <- age.unc(Mo.anox.py.rf)
test.data.w.age.model <- age.model(test.data, iteration = iteration, dataset = "Mo.test", plot.strat = TRUE, run.w.rf = FALSE)
# Additional age model test
iteration <- "test.2"
test.data.2 <- age.unc(Fepy.anox.rf)
test.data.w.age.model.2 <- age.model(test.data.2, iteration = iteration, dataset = "Fepy.test", plot.strat = TRUE, run.w.rf = FALSE)
4. Monte Carlo random forest function
We define a function to conduct n random forest model analyses (100
in main text of this study), with the aim of 1) generating partial
dependence plots for the variable of interest with respect to geologic
time when all other specified variables are held constant, 2) generating
variable importance analyses of all other specified variables, 3)
evaluating model uncertainties associated with partial context data and
geologic age uncertainties.
This function calls the functions age.unc(), Al.impute() and
partial.context() above if the options to run them are selected in the
function itself.
Monte.Carlo.rF <- function(data,
resp.var,
vars,
n,
run.age.unc,
run.partial.context,
run.Al.impute,
plot.strat = FALSE,
plot.strat.sum = FALSE,
run.w.rf = TRUE,
stepped.threshold = 2){
# initiate data frames
partial.plot.data <- data.frame(Age=double(), Var=double(), Iteration=double())
importance.data <- data.frame(Variable=double(), IncMSE=double(), IncNodePurity=double(), Iteration=double())
model.fit.data <- data.frame(best.nodesize=double(), best.mtry=double(), best.ntree=double(), best.MSE=double(), best.var=double())
# NOTE - moved this up on 20230222 - assume only have to do this once?
if(run.age.unc == TRUE){
data <- age.unc(data)
}
# set core/cuttings depth to -ve so can use same script for outcrop and cuttings - *used to be in age model*
data$height.depth[data$site.type == "core" | data$site.type == "cuttings"] <- -data$height.depth[data$site.type == "core" | data$site.type == "cuttings"]
# Fix Cambrian ages muck ups - *used to be in age model*
# If relationship between min and max age is as expected (max age greater than min age), keep columns the same.
# If relationship is wrong way around, switch them
data <- transform(data, max.age = ifelse(max.age >= min.age, max.age, min.age), min.age = ifelse(max.age >= min.age, min.age, max.age))
# If we're going to plot stratigraphic summary figures, need to initiate those figures.
if(plot.strat.sum == TRUE){
# summarise all sections to create list
sections.sum <- data %>%
group_by(section.name) %>%
tally()
dataset <- resp.var
for (row in 1:nrow(sections.sum)){
# select individual section (looped)
section <- sections.sum$section.name[row]
section.data.for.plot <- filter(data, section.name == section)
# Eliminate symbols that can't be saved in section plot file names (for plotting)
section <- sub("/", "-", section)
section <- sub(":", "-", section)
# Does section have height/depth data? If so (i.e. if is.na()=FALSE), initiate plot with age model on x axis and height/depth on y axis
if(is.na(mean(section.data.for.plot$height.depth, na.rm = FALSE)) == FALSE){
# check if height/depths need flipping. Skip sections with one sample.
if(nrow(section.data.for.plot) >1){
#print("Step 2")
# if interpreted section age ascends with height/depth, flip section
sign.heights.depths <- mean(sign(diff(section.data.for.plot$height.depth)), na.rm =TRUE)
sign.interpreted.ages <- mean(sign(diff(section.data.for.plot$interpreted.age)), na.rm =TRUE)
# We anticipate ages to decrease as height increases, therefore signs (-1 or 1) of heights/depths and intepreted ages should be opposite
# therefore the product of the two signs should always be negative (-1)
# if they are not, then make heights/depths negative. This may not be an accurate representation of the geological section but that should not matter, this is simply for the age model calculations which will now proceed in the correct direction.
# note that we are testing for the majority of the section younging up here.
if(sign.heights.depths*sign.interpreted.ages > -0.999){
# check flipping will work and the section isn't just a total mess. If it will work (and flipping will make sign very nearly 1), flip
if(sign.heights.depths*sign.interpreted.ages > +0.999){
section.data.for.plot$height.depth <- -section.data.for.plot$height.depth}
}
}
# In plot calls, reverse x and y and then use coord flip to make plotting age models easier.
# make base plot
plot <- ggplot()+
geom_errorbar(data = section.data.for.plot, aes(x = height.depth, ymin = min.age, ymax=max.age), colour = 'darkred', width = .5)+
geom_point(data = section.data.for.plot, aes(x = height.depth, y = interpreted.age), shape=21, fill="darkorange3", size=5, alpha=1) +
theme_bw()+
scale_y_reverse()+coord_flip()+
xlab(expression("Height/Depth"))+ylab(expression("Age Model (Ma)"))+
theme(plot.margin = ggplot2::margin(1,1,1,1,"cm"),panel.border = element_rect(fill=NA,color="black", linewidth=2,linetype="solid"),
axis.line = element_line(lineend = 'square'),
axis.ticks = element_line(linewidth=1),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=24),
legend.text = element_text( size=18),
axis.ticks.length = unit(5, "points"),
legend.justification=c(1,1), legend.position=c(.98,.36),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
# If there is only one sample in the section, label it as such.
if(nrow(section.data.for.plot) < 2){
plot <- plot + labs(title = "Option 3a: \nSingle sample section + measured section")+ theme(plot.title = element_text(size=18))
# Otherwise...
}else{
# Does the section have an age structure?
# if all of the interpreted ages are essentially the same (within 1 year), then the section has no interpreted age structure. Then label it as such.
if(diff(range(section.data.for.plot$interpreted.age)) <= 1e-6){
plot <- plot + labs(title = "Option 1b: \nHeights and depths for all samples + no age structure") + theme(plot.title = element_text(size=18))
} else{
# check if interpreted age model is unnaturally stepped
# (i.e. multiple groups with all the same age - similar to scenario above)
age.group.sum.for.plot <- section.data.for.plot %>%
group_by(interpreted.age) %>%
tally()
# If this section meets our criteria for a stepped age structure, label it as such.
# RGS note - could revist our criteria for stepped age structure.
if(nrow(age.group.sum.for.plot) <= (nrow(section.data.for.plot)/stepped.threshold)){
plot <- plot + labs(title = "Option 1c: \nHeights and depths for all samples + stepped age structure") + theme(plot.title = element_text(size=18))
# If this section has an age structure and it is not stepped, label it as such. Note, this is essentially our gold standard section.
}else{
plot <- plot + labs(title = "Option 1a: \nHeights and depths for all samples + age structure") + theme(plot.title = element_text(size=18))
}
}
}
}
# If the section does NOT have height/depth data (i.e. if is.na()=TRUE), initiate plot with age model on x axis and interpreted age on y axis
# In plot calls, reverse x and y and then use coord flip to make plotting age models easier.
if(is.na(mean(section.data.for.plot$height.depth, na.rm = FALSE)) == TRUE){
# make base plot
plot <- ggplot()+
geom_errorbar(data = section.data.for.plot, aes(x = interpreted.age, ymin = min.age, ymax=max.age), colour = 'darkred', width = .5)+ geom_point(data = section.data.for.plot, aes(x = interpreted.age, y = interpreted.age), shape=21, fill="darkorange3", size=5, alpha=1) +
theme_bw()+
scale_y_reverse()+scale_x_reverse()+coord_flip()+
xlab(expression("Interpreted Age (Ma)"))+ylab(expression("Age Model (Ma)"))+
theme(plot.margin = ggplot2::margin(1,1,1,1,"cm"),panel.border = element_rect(fill=NA,color="black", linewidth=2,linetype="solid"),
axis.line = element_line(lineend = 'square'),
axis.ticks = element_line(linewidth=1),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=24),
legend.text = element_text( size=18),
axis.ticks.length = unit(5, "points"),
legend.justification=c(1,1), legend.position=c(.98,.36),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
# if section only has one sample, label it as such.
if(nrow(section.data.for.plot) < 2){
plot <- plot + labs(title = "Option 3b: \nSingle sample section + no measured section") + theme(plot.title = element_text(size=18))
# Otherwise...
# Does the section have an age structure?
# if all of the interpreted ages are essentially the same (within 1 year), then the section has no interpreted age structure. Then label it as such.
}else{
if(diff(range(section.data.for.plot$interpreted.age)) <= 1e-6){
plot <- plot + labs(title = "Option 2b: \nNo measured section + no age structure") + theme(plot.title = element_text(size=18))
} else{
# check if interpretted age model is unnaturally stepped
# (i.e. multiple groups with all the same age - similar to scenario above)
age.group.sum.for.plot <- section.data.for.plot %>%
group_by(interpreted.age) %>%
tally()
# If this section meets our criteria for a stepped age structure, label it as such.
# RGS note - could revist our criteria for stepped age structure.
if(nrow(age.group.sum.for.plot) <= (nrow(section.data.for.plot)/stepped.threshold)){
plot <- plot + labs(title = "Option 2c: \nNo measured section + stepped age structure") + theme(plot.title = element_text(size=18))
}else{
# If this section has an age structure and it is not stepped, label it as such.
plot <- plot + labs(title = "Option 2a: \nNo measured section + age structure") + theme(plot.title = element_text(size=18))
}
}
}
}
assign(paste0("section-", section), plot)
dataset.file.name <- sub("..ppm.|..wt..", "", dataset)
mainDir <- getwd()
subDir <- paste("strat.model.plots/", dataset.file.name, sep = "")
subsubDir <- paste("/stratigraphic-model-summary", sep = "")
dir.create(file.path(mainDir, subDir), showWarnings = FALSE)
dir.create(file.path(mainDir, subDir, subsubDir), showWarnings = FALSE)
setwd(file.path(mainDir, subDir, subsubDir))
ggsave(paste0(section ,".pdf"), get(paste0("section-", section)), width=9.3, height=8.5, units="in")
setwd(file.path(mainDir))
}
}
###### START OF KEY LOOP ######
# initiate loop
# setting up a back end...
# how many cores do we have??
parallel::detectCores()
n.cores <- c((parallel::detectCores() - 1), n)[which.min(c((parallel::detectCores() - 1), n))]
#create the cluster
# NOTE - for final version should have options...
# e.g. - if local.unix == TRUE then type = "FORK", else type = "PSOCK"
my.cluster <- parallel::makeCluster(
n.cores,
type = "PSOCK" # PSOCK for iridis5 and RGS MacBook
#type = "FORK" # FORK for ubuntu machines?
)
#check cluster definition (optional)
print(my.cluster)
#register it to be used by %dopar%
doParallel::registerDoParallel(cl = my.cluster)
#check if it is registered (optional)
foreach::getDoParRegistered()
#how many workers are available? (optional)
foreach::getDoParWorkers()
rf.sum.par <- foreach(
iteration = 1:n,
.combine = 'c',
.packages = c("randomForest", "dplyr", "ggplot2")
) %dopar% {
source("SGP-functions-unannotated.R")
#for (iteration in 1:n){
# make new data frame for iteration.
data_it <- data
if(run.Al.impute == TRUE){
data_it <- Al.impute(data_it)
}
# Note that Al.impute should be run before partial.context so that samples without assigned lithology have Al imputed based on all identified samples (not their random assignment in this iteration).
if(run.partial.context == TRUE){
data_it <- partial.context(data_it,
site.type = TRUE,
metamorphic.bin = TRUE,
basin.type = TRUE,
site.latitude = TRUE,
site.longitude = TRUE,
environmental.bin = TRUE,
lithology.name = TRUE)
}
# Run age model for this iteration
data_it <- age.model(data_it, iteration = iteration, dataset = resp.var, plot.strat = plot.strat, run.w.rf = run.w.rf)
# if plotting stratigraphic summary plots, need to add the age models to our summary plots for each iteration.
if(plot.strat.sum == TRUE){
# summarise all sections to create list
sections.sum <- data %>%
group_by(section.name) %>%
tally()
dataset <- resp.var
# plot section by section
for (row in 1:nrow(sections.sum)){
section <- sections.sum$section.name[row]
section.data.for.plot <- filter(data_it, section.name == section)
section <- sub("/", "-", section)
section <- sub(":", "-", section)
# for sections with height/depths
# i.e. Scenarios 1a, 1b, 1c, 3a
if(is.na(mean(section.data.for.plot$height.depth, na.rm = FALSE)) == FALSE){
add.on.path <- geom_path(data = section.data.for.plot, aes(x = height.depth, y = age.model), color="grey5", size=.8, alpha=0.2)
add.on.point <- geom_point(data = section.data.for.plot, aes(x = height.depth, y = age.model), shape=15, color="grey40", size=3, alpha=0.2)
}
# for sections with no height/depths but interpreted ages
# i.e. Scenarios 2a, 2b, 2c, 3b
if(is.na(mean(section.data.for.plot$height.depth, na.rm = FALSE)) == TRUE){
add.on.path <- geom_path(data = section.data.for.plot, aes(x = interpreted.age, y = age.model), shape=21, color="grey5", size=.8, alpha=0.2)
add.on.point <- geom_point(data = section.data.for.plot, aes(x = interpreted.age, y = age.model), shape=15, color="grey40", size=3, alpha=0.2)
}
assign(paste0("section-", section), get(paste0("section-", section)) + add.on.path + add.on.point)
dataset.file.name <- sub("..ppm.|..wt..", "", dataset)
mainDir <- getwd()
subDir <- paste("strat.model.plots/", dataset.file.name, sep = "")
subsubDir <- paste("/stratigraphic-model-summary", sep = "")
dir.create(file.path(mainDir, subDir), showWarnings = FALSE)
dir.create(file.path(mainDir, subDir, subsubDir), showWarnings = FALSE)
setwd(file.path(mainDir, subDir, subsubDir))
ggsave(paste0(section ,".pdf"), get(paste0("section-", section)), width=9.3, height=8.5, units="in")
setwd(file.path(mainDir))
}
}
# select all of the variables named in the "vars" vector to be columns in the dataframe data.for.rF
data.for.rF <- select(data_it, all_of(vars))
# remove any rows that still contain NAs
data.for.rF <- na.omit(data.for.rF)
# define a formula that identifies the response variable defined in function and uses all other vars as model variables.
formula <- as.formula(paste(resp.var, "~ ."))
# Random forest function parameter notes
# ntree - Number of trees to grow. Default is 500 (used here)
# mtry=3 - Number of variables randomly sampled as candidates at each split. The default values are different for classification (sqrt(p) where p is number of variables in x) and regression (p/3). p/3 is therefore what is used here.
# importance=TRUE - used for generating variable importance plots.
# proximity=TRUE - used for generating proximity plots (we never do this in this study).
# na.action=na.omit - A function to specify the action to be taken if NAs are found. (NOTE: If given, this argument must be named.)
# Note that replace = TRUE by default.
########## ADDING IN HYPERPARAMETER OPTIMISATION ###########
### Trying a simple loop for grid search for hyperparameter optimization
train.Mo <- sample(1:nrow(data.for.rF), (nrow(data.for.rF)/3)*2) # # this time do it w 2/3... use 4/5 of the data for training
Mo.test <- data.for.rF[-train.Mo, resp.var]
#summary(tree.Mo)
ntree.vec <- 2^seq(0, 8, 1) # this is a reasonable range but check MSE is plateuing w increased ntree by the end
mtry.vec <- seq(2, 10, 2)
nodesize.vec.too.large <- 2^seq(1, 20, 1)
nodesize.vec <- nodesize.vec.too.large[nodesize.vec.too.large < (nrow(data.for.rF)/4)] # essentially saying we want at least 4 nodes?
MSE.sum <- c()
i <- 0
for(ntree.no in ntree.vec){
for(mtry.no in mtry.vec){
for(nodesize.no in nodesize.vec){
i <- i + 1
rf.Mo <- randomForest(formula, data = data.for.rF, subset = train.Mo, mtry = mtry.no, ntree=ntree.no, nodesize = nodesize.no, importance = TRUE)
rf.Mo
yhat.Mo.rf <- predict(rf.Mo, newdata = data.for.rF[-train.Mo, ])
plot(yhat.Mo.rf, Mo.test)
abline(0, 1)
MSE <- mean((yhat.Mo.rf - Mo.test)^2)
exp.var <- round(100 * rf.Mo$rsq[length(rf.Mo$rsq)], digits = 2)
MSE.sum$ntree.no[i] <- ntree.no
MSE.sum$mtry.no[i] <- mtry.no
MSE.sum$nodesize.no[i] <- nodesize.no
MSE.sum$MSE[i] <- MSE
MSE.sum$exp.var[i] <- exp.var
}
}
}
MSE.sum <- as.data.frame(MSE.sum)
best.row <- which.min(MSE.sum$MSE)
MSE.sum.best.row <- MSE.sum[best.row,]
best.nodesize <- MSE.sum$nodesize.no[best.row]
best.mtry <- MSE.sum$mtry.no[best.row]
best.ntree <- MSE.sum$ntree.no[best.row]
best.MSE <- MSE.sum$MSE[best.row]
best.var <- MSE.sum$exp.var[best.row]
model.fit.row <- cbind(best.nodesize,
best.mtry,
best.ntree,
best.MSE,
best.var
)
model.fit.data[1,] <- model.fit.row
ggplot(MSE.sum, aes(y = MSE, x = 1:nrow(MSE.sum)))+
geom_point(aes(colour = nodesize.no, size = mtry.no, alpha = ntree.no))+
annotate(geom="point", y = MSE.sum.best.row$MSE, x = best.row, colour = "red")+
theme_bw()+
scale_color_viridis_c()
dataset <- resp.var
dataset.file.name <- sub("..ppm.|..wt..", "", dataset)
mainDir <- getwd()
subDir <- paste("hyperparameter.plots/", dataset.file.name, sep = "")
dir.create(file.path(mainDir, subDir), showWarnings = FALSE)
setwd(file.path(mainDir, subDir))
ggsave(paste0("hyperparameter-validation-", iteration ,".pdf"), width=10, height=8.5, units="in")
ggplot(MSE.sum, aes(y = exp.var, x = 1:nrow(MSE.sum)))+
geom_point(aes(colour = nodesize.no, size = mtry.no, alpha = ntree.no))+
annotate(geom="point", y = best.var, x = best.row, colour = "red")+
theme_bw()+
scale_color_viridis_c()
ggsave(paste0("variance-hyperparameter-validation-", iteration ,".pdf"), width=10, height=8.5, units="in")
setwd(file.path(mainDir))
# suggested new equation model code:
it.rF <- randomForest(formula, data.for.rF, na.action=na.omit, subset = train.Mo, mtry = best.mtry, ntree = best.ntree, nodesize = best.nodesize, importance = TRUE)
it.partial.plot <- as.data.frame(partialPlot(it.rF, data.for.rF, age.model))
it.import <- as.data.frame(importance(it.rF))
it.import <- tibble::rownames_to_column(it.import, "Variable")
# NOTE - we do not use rbind() to combine random forest iterations in order to make this function more efficient. This slightly reduces readability but increases efficiency dramatically for high n values.
#partial.i <- nrow(partial.plot.data)+1
#partial.j <- nrow(partial.plot.data)+nrow(it.partial.plot)
#partial.plot.data[partial.i:partial.j, 1:2] <- it.partial.plot
#partial.plot.data[partial.i:partial.j, 3] <- rep(iteration, nrow(it.partial.plot))
# rbind version
names(it.partial.plot) <- c("Age", "Var")
it.partial.plot$Iteration <- rep(iteration, nrow(it.partial.plot))
#partial.plot.data <- rbind(partial.plot.data, it.partial.plot)
partial.plot.data <- it.partial.plot
#import.i <- nrow(importance.data)+1
#import.j <- nrow(importance.data)+nrow(it.import)
#importance.data[import.i:import.j, 1:3] <- it.import
#importance.data[import.i:import.j, 4] <- rep(iteration, nrow(it.import))
# rbind version
names(it.partial.plot) <- c("Variable", "IncMSE", "IncNodePurity")
it.import$Iteration <- rep(iteration, nrow(it.import))
#importance.data <- rbind(importance.data, it.import)
importance.data <- it.import
# Name variables in variable importance data nicely for plotting etc.
importance.data$Variable[importance.data$Variable == "age.model"] <- "Age model"
importance.data$Variable[importance.data$Variable == "site.type"] <- "Site type"
importance.data$Variable[importance.data$Variable == "metamorphic.bin"] <- "Metamorphic bin"
importance.data$Variable[importance.data$Variable == "basin.type"] <- "Basin type"
importance.data$Variable[importance.data$Variable == "site.latitude"] <- "Latitude"
importance.data$Variable[importance.data$Variable == "site.longitude"] <- "Longitude"
importance.data$Variable[importance.data$Variable == "lithology.name"] <- "Lithology"
importance.data$Variable[importance.data$Variable == "environmental.bin"] <- "Environmental bin"
if(("TOC..wt.." %in% vars == TRUE) & (resp.var != "TOC..wt..")){
importance.data$Variable[importance.data$Variable == "TOC..wt.."] <- "TOC"
}
if(("Fe.py.FeHR" %in% vars == TRUE) & (resp.var != "Fe.py.FeHR")){
importance.data$Variable[importance.data$Variable == "Fe.py.FeHR"] <- "Fepy/FeHR"
}
if(("Al..wt.." %in% vars == TRUE) & (resp.var != "Al..wt..")){
importance.data$Variable[importance.data$Variable == "Al..wt.."] <- "Al"
}
rf.sum.it <- list(partial.plot.data, importance.data, model.fit.data)
names(rf.sum.it) <- c("partial.plot.data", "importance.data", "model.fit.data")
return(rf.sum.it)
print(paste("Finished iteration", iteration))
}
#rf.sum <- list(partial.plot.data, importance.data)
#names(rf.sum) <- c("partial.plot.data", "importance.data")
parallel::stopCluster(cl = my.cluster)
# initiate frames
partial.plot.data <- data.frame(Age=double(), Var=double(), Iteration=double())
importance.data <- data.frame(Variable=double(), IncMSE=double(), IncNodePurity=double(), Iteration=double())
model.fit.data <- data.frame(best.nodesize=double(), best.mtry=double(), best.ntree=double(), best.MSE=double(), best.var=double())
# quick assimilation loop
for(frame in seq(1, length(rf.sum.par), 3)){
partial.plot.data <- rbind(partial.plot.data, as.data.frame(rf.sum.par[frame]))
importance.data <- rbind(importance.data, as.data.frame(rf.sum.par[frame+1]))
model.fit.data <- rbind(model.fit.data, as.data.frame(rf.sum.par[frame+2]))
}
names(partial.plot.data) <- c("Age", "Var", "Iteration")
names(importance.data) <- c("Variable", "IncMSE", "IncNodePurity", "Iteration")
names(model.fit.data) <- c("best.nodesize", "best.mtry", "best.ntree", "best.MSE", "best.var")
# lots of repetition of partial.plot.data and importance.data (renamed multiple times) - rename for tidiness? should be fine though, just not great practice.
rf.sum <- list(partial.plot.data, importance.data, model.fit.data)
names(rf.sum) <- c("partial.plot.data", "importance.data", "model.fit.data")
return(rf.sum)
}
Test Monte Carlo random forest function using primary Mo analyses as
an example, with 3 iterations.
test <- Monte.Carlo.rF(data = Mo.anox.py.rf,
resp.var = "Mo..ppm.",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"Mo..ppm.",
"TOC..wt..",
"Fe.py.FeHR",
"Al..wt.."),
n = 3,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE,
plot.strat = FALSE)
## socket cluster with 3 nodes on host 'localhost'
summary(test)
## Length Class Mode
## partial.plot.data 3 data.frame list
## importance.data 4 data.frame list
## model.fit.data 5 data.frame list
summary(test$partial.plot.data)
## Age Var Iteration
## Min. :299.8 Min. :21.57 Min. :1
## 1st Qu.:434.7 1st Qu.:24.88 1st Qu.:1
## Median :568.5 Median :26.09 Median :2
## Mean :570.4 Mean :28.88 Mean :2
## 3rd Qu.:708.1 3rd Qu.:28.55 3rd Qu.:3
## Max. :842.8 Max. :45.98 Max. :3
summary(test$importance.data)
## Variable IncMSE IncNodePurity Iteration
## Length:33 Min. : 0.603 Min. : 26168 Min. :1
## Class :character 1st Qu.: 3.083 1st Qu.: 62027 1st Qu.:1
## Mode :character Median : 4.928 Median : 283097 Median :2
## Mean : 6.343 Mean : 381078 Mean :2
## 3rd Qu.: 7.215 3rd Qu.: 387821 3rd Qu.:3
## Max. :29.874 Max. :2287082 Max. :3
summary(test$model.fit.data)
## best.nodesize best.mtry best.ntree best.MSE
## Min. :2 Min. :2.000 Min. : 16.00 Min. : 908.4
## 1st Qu.:2 1st Qu.:3.000 1st Qu.: 40.00 1st Qu.: 908.7
## Median :2 Median :4.000 Median : 64.00 Median : 909.0
## Mean :2 Mean :3.333 Mean : 69.33 Mean :1020.0
## 3rd Qu.:2 3rd Qu.:4.000 3rd Qu.: 96.00 3rd Qu.:1075.7
## Max. :2 Max. :4.000 Max. :128.00 Max. :1242.4
## best.var
## Min. :57.26
## 1st Qu.:60.21
## Median :63.16
## Mean :62.59
## 3rd Qu.:65.25
## Max. :67.34
5. Running primary analyses
Set number of iterations. Default for main text analyses is n =
100.
n <- 100
Molybdenum
For our primary Mo analyses we include all key geologic context
variables (“site.type”, “metamorphic.bin”, “basin.type”,
“site.latitude”, “site.longitude”, “lithology.name”,
“environmental.bin”) as well as geologic age, TOC, Fepy/FeHR and [Al] in
our random forest model. We incorporate TOC and Fepy/FeHR to control for
the impacts of organic carbon and sulfide availability on Mo reduction
rates (note that only anoxic samples are used in these analyses). [Al]
is incorporated to control for detrital input.
Note that in all of these random forest function calls figures and
results are hidden to prevent extended printouts in the R markdown
files, but “fig.show=‘hide’” and “results=‘hide’” can be deleted to see
full output.
Mo.anox.py.rf.results <- Monte.Carlo.rF(data = Mo.anox.py.rf,
resp.var = "Mo..ppm.",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"Mo..ppm.",
"TOC..wt..",
"Fe.py.FeHR",
"Al..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
Mo.anox.py.rf.partial <- Mo.anox.py.rf.results$partial.plot.data
Mo.anox.py.rf.import <- Mo.anox.py.rf.results$importance.data
Mo.anox.py.rf.model <- Mo.anox.py.rf.results$model.fit.data
Uranium
For our primary U analyses we include all key geologic context
variables (“site.type”, “metamorphic.bin”, “basin.type”,
“site.latitude”, “site.longitude”, “lithology.name”,
“environmental.bin”) as well as geologic age, TOC and [Al] in our random
forest model. We incorporate TOC to control for the impacts of organic
carbon availability on U reduction rates (note that only anoxic samples
are used in these analyses). [Al] is incorporated to control for
detrital input.
U.anox.rf.results <- Monte.Carlo.rF(data = U.anox.rf,
resp.var = "U..ppm.",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"U..ppm.",
"TOC..wt..",
"Al..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
U.anox.rf.partial <- U.anox.rf.results$partial.plot.data
U.anox.rf.import <- U.anox.rf.results$importance.data
U.anox.rf.model <- U.anox.rf.results$model.fit.data
Proportion euxinic
For our primary proportion euxinic analyses we include all key
geologic context variables (“site.type”, “metamorphic.bin”,
“basin.type”, “site.latitude”, “site.longitude”, “lithology.name”,
“environmental.bin”) as well as geologic age and [Al] in our random
forest model. [Al] is incorporated to control for detrital input. Note
that only anoxic samples are used in these analyses.
Note that unless warnings are hidden (as they are in this R Markdown
file) a warning appears asking if we are sure we want to do a regression
because of the binary response variable. In this case we do, following a
similar approach in Sperling et al. 2015 (Nature). For completeness,
this warning would read: “## Warning in randomForest.default(m, y, …):
The response has five or fewer unique values. Are you sure you want to
do regression?” if it was not hidden.
Fepy.anox.rf.results <- Monte.Carlo.rF(data = Fepy.anox.rf,
resp.var = "euxinic.Fe",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"euxinic.Fe",
"Al..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
Fepy.anox.rf.partial <- Fepy.anox.rf.results$partial.plot.data
Fepy.anox.rf.import <- Fepy.anox.rf.results$importance.data
Fepy.anox.rf.model <- Fepy.anox.rf.results$model.fit.data
Total Organic Carbon
For our primary TOC analyses we include all key geologic context
variables (“site.type”, “metamorphic.bin”, “basin.type”,
“site.latitude”, “site.longitude”, “lithology.name”,
“environmental.bin”) as well as geologic age and [Al] in our random
forest model. [Al] is incorporated to control for detrital input.
TOC.all.rf.results <- Monte.Carlo.rF(data = TOC.all.rf,
resp.var = "TOC..wt..",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"TOC..wt..",
"Al..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
TOC.all.rf.partial <- TOC.all.rf.results$partial.plot.data
TOC.all.rf.import <- TOC.all.rf.results$importance.data
TOC.all.rf.model <- TOC.all.rf.results$model.fit.data
6. Function to interpolate partial dependence plot lines
To plot envelopes that summarizing all n (100) random forests, we
need to interpolate the lines drawn between points defined in partial
dependence plots. We can then evaluate the distribution of partial
dependence plot values at each timestep.
interp.pdp <- function(data, age.rounding.factor, n){
# data - partial dependence plot dataframe
# age.rounding.factor - number of digits (decimal places) for millions of years
# n - number of iterations
# initiate dataframe to store interpolated PDP values
PDPs.interpolated <- data.frame(Age=double(), Var=double(), Iteration=double())
# Before starting, round age splits to nearest x Myrs (if age.rounding factor = 0, 1 Myrs; if age.rounding factor = 1, 0.1 Myrs)
data$Age.rounded <- round(data$Age, digits=age.rounding.factor)
# loop through n iterations of Monte Carlo random forest analysis
for (It in 1:n){
# create subset for specific iteration
It.subset <- filter(data, Iteration == It)
# interpolate values at each timestep
Interpolated.Age.Var <- approx(x = It.subset$Age.rounded,
y = It.subset$Var,
xout = seq(min(It.subset$Age.rounded, na.rm = T),
max(It.subset$Age.rounded, na.rm = T),
10^-age.rounding.factor),
method = "linear",
ties = mean) %>%
as.data.frame() %>%
setNames(c("Age", "Var")) # only used for visual inspection of this dataframe
# Add interpolated data to summary dataframe
i <- nrow(PDPs.interpolated)+1
j <- nrow(PDPs.interpolated)+nrow(Interpolated.Age.Var)
PDPs.interpolated[i:j , 1:2] <- Interpolated.Age.Var
PDPs.interpolated[i:j , 3] <- It
}
# Summarize interpolated PDPs
# convert age to factor and then convert back otherwise end up with occasional weird duplicates using group_by() on a numerical variable.
Var.sum <- PDPs.interpolated %>%
group_by(as.factor(Age)) %>%
summarize(min = min(Var, na.rm=T),
perc.05 = quantile(Var, probs=0.05, na.rm=T),
perc.25 = quantile(Var, probs=0.25, na.rm=T),
median = median(Var, na.rm=T),
mean = mean(Var, na.rm=T),
perc.75 = quantile(Var, probs=0.75, na.rm=T),
perc.95 = quantile(Var, probs=0.95, na.rm=T),
max = max(Var, na.rm=T)) %>%
as.data.frame()
# rename as.factor(Age) column "Age"
names(Var.sum)[1] <- "Age"
# make Age numeric again
Var.sum$Age <- as.numeric(paste(Var.sum$Age))
return(Var.sum)
}
Test function using primary Mo analyses. Do lines based on the
(sparse) partial dependence plot points generated by the random forest
function sit within the min-max envelope generated by our function? In
this study we use an age rounding factor of 1 (to nearest 0.1Myrs) for
the sake of computational efficiency but the patterns are observed at
rounding factors of 2 (0.01Myrs) and 0 (1Myrs). Note that really coarse
rounding factors (e.g. a factor of -1, or 10Myrs) will not accurately
reproduce the partial dependence plots generated by the random forest
function.
mo.pdp.test <- interp.pdp(data = Mo.anox.py.rf.partial, age.rounding.factor = 1, n = n)
Mo.test.plot <- ggplot()+
geom_ribbon(data = mo.pdp.test, aes(x = Age, ymin = min, ymax = max), alpha=.4, fill="darkred", color="grey70", size=.3)+
geom_path(data = Mo.anox.py.rf.partial, aes(x = Age, y = Var, group = Iteration), alpha = 0.4, size = 0.3, color = "grey10")+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(0,62),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.3,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## i Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## i Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Mo.test.plot

All of the individual lines lie within the envelope of minimum and
maximum values defined by our interp.pdp() function.
7. Plotting primary analyses
In these plots we summarize the 100 partial dependence plots run for
each analysis as envelopes. The lighter gray envelope describes the
maximum and minimum values for each timestep and the darker gray
envelope describes the 25th and 75th percentiles.
Molybdenum
mo.pdp <- interp.pdp(data = Mo.anox.py.rf.partial, age.rounding.factor = 1, n = n)
Mo.plot <- ggplot()+
geom_ribbon(data = mo.pdp, aes(x = Age, ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(data = mo.pdp, aes(x = Age, ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(0,62),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.3,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
Mo.plot

Uranium
u.pdp <- interp.pdp(data = U.anox.rf.partial, age.rounding.factor = 1, n = n)
U.plot <- ggplot()+
geom_ribbon(data = u.pdp, aes(x = Age, ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(data = u.pdp, aes(x = Age, ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(0,82),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("U (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.3,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
ggplot()+
geom_line(data = u.pdp, aes(x = Age, y=min), alpha=1, color="grey70", size=.3)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.01,1.03),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Proportion euxinic")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.3,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())

U.plot

Proportion euxinic
Note that we hide warnings here because of random forest warning
about using binary variable in a regression (here we do want to do a
regression to replicate the broad approach of Sperling et al. 2015,
Nature - see similar comments above).
prop_eux.pdp <- interp.pdp(data = Fepy.anox.rf.partial, age.rounding.factor = 1, n = n)
prop_eux.plot <- ggplot()+
geom_ribbon(data = prop_eux.pdp, aes(x = Age, ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(data = prop_eux.pdp, aes(x = Age, ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.01,1.03),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Proportion euxinic")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.3,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
ggplot()+
geom_point(data = prop_eux.pdp, aes(x = Age, y=max))+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.01,1.03),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Proportion euxinic")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.3,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())

prop_eux.plot

Total Organic Carbon
TOC.pdp <- interp.pdp(data = TOC.all.rf.partial, age.rounding.factor = 1, n = n)
TOC.plot <- ggplot()+
geom_ribbon(data = TOC.pdp, aes(x = Age, ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(data = TOC.pdp, aes(x = Age, ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(0,8.3),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("TOC (wt %)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.3,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
TOC.plot

Combine to generate primary summary plot
In these plots we have delineated the three main phases of marine
biogeochemistry between 1000 and 300Ma - with transitions around the
base of of the Cambrian and Devonian periods.
Mo.plot.for.sum <- ggplot(mo.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.4,62),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
U.plot.for.sum <- ggplot(u.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.4,82),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("U (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
prop_eux.plot.for.sum <- ggplot(prop_eux.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.01,1.03),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Proportion euxinic")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
TOC.plot.for.sum <- ggplot(TOC.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-0.04,8.3),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("TOC (wt%)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
summary.plot <- ggarrange2(Mo.plot.for.sum, prop_eux.plot.for.sum, U.plot.for.sum, TOC.plot.for.sum, ncol=2, heights=c(1,1))

ggsave("Figure 2 Partial Dependence Plot with shading 20240207 100 iterations.pdf", summary.plot, height=14, width=26)
save(mo.pdp, u.pdp, prop_eux.pdp, TOC.pdp, file = "primary-rf-analyses-20240207-100-iterations.RData")
8. Expanded analyses varying redox-sensitive predictors
Anoxic Mo (no pyrite)
Here, we generate an anoxic Mo equivalent to our primary U dataset
(not restricting to anoxic samples with iron speciation). Fepy/FeHR is
not included as a predictor variable.
Mo.anox.rf <- trace.toc.full %>%
filter(!is.na(Mo..ppm.)) %>%
filter(FeHR.FeT >= 0.38 | FeT.Al >= 0.53)
nrow(Mo.anox.rf)
Mo.anox.rf.results <- Monte.Carlo.rF(data = Mo.anox.rf,
resp.var = "Mo..ppm.",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"Mo..ppm.",
"TOC..wt..",
"Al..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
Mo.anox.rf.partial <- Mo.anox.rf.results$partial.plot.data
Mo.anox.rf.import <- Mo.anox.rf.results$importance.data
mo.no_py.pdp <- interp.pdp(data = Mo.anox.rf.partial, age.rounding.factor = 1, n = n)
We combine these analyses with our primary Mo analyses for
plotting.
mo.pdp$treatment <- "Anoxic samples + TOC + Fepy/FeHR"
mo.no_py.pdp$treatment <- "Anoxic samples + TOC"
mo.pdp.sum <- rbind(mo.pdp,
mo.no_py.pdp)
Mo - testing ensitivity to iron speciation thresholds (include
FeHR/FeT and Fepy/FeHR as predictor variables, rather than using as
filter cutoffs)
Mo.thresh.test.rf <- trace.toc.full %>%
filter(!is.na(Mo..ppm.) & !is.na(FeHR.FeT) & !is.na(Fe.py.FeHR))
nrow(Mo.thresh.test.rf)
Mo.thresh.test.rf.results <- Monte.Carlo.rF(data = Mo.thresh.test.rf,
resp.var = "Mo..ppm.",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"Mo..ppm.",
"TOC..wt..",
"FeHR.FeT",
"Fe.py.FeHR",
"Al..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
Mo.thresh.test.rf.partial <- Mo.thresh.test.rf.results$partial.plot.data
Mo.thresh.test.rf.import <- Mo.thresh.test.rf.results$importance.data
mo.thresh.test.pdp <- interp.pdp(data = Mo.thresh.test.rf.partial, age.rounding.factor = 1, n = n)
We combine these analyses with our primary Mo analyses for
plotting.
mo.pdp$treatment <- "Fe-speciation filter approach"
mo.thresh.test.pdp$treatment <- "Fe-speciation predictor approach"
mo.pdp.sum.thresh <- rbind(mo.pdp,
mo.thresh.test.pdp)
Anoxic U with pyrite
Here, we generate an anoxic U equivalent to our primary Mo dataset
(restricting to anoxic samples with iron speciation). Fepy/FeHR is
included as a predictor variable.
U.anox.py.rf <- trace.toc.full %>%
filter(!is.na(U..ppm.) & !is.na(Fe.py.FeHR)) %>%
filter(FeHR.FeT >= 0.38)
nrow(U.anox.py.rf)
U.anox.py.rf.results <- Monte.Carlo.rF(data = U.anox.py.rf,
resp.var = "U..ppm.",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"U..ppm.",
"TOC..wt..",
"Fe.py.FeHR",
"Al..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
U.anox.py.rf.partial <- U.anox.py.rf.results$partial.plot.data
U.anox.py.rf.import <- U.anox.py.rf.results$importance.data
u.w_py.pdp <- interp.pdp(data = U.anox.py.rf.partial, age.rounding.factor = 1, n = n)
We combine these analyses with our primary U analyses for
plotting.
u.pdp$treatment <- "Anoxic samples + TOC"
u.w_py.pdp$treatment <- "Anoxic samples + TOC + Fepy/FeHR"
u.pdp.sum <- rbind(u.pdp,
u.w_py.pdp)
U - testing ensitivity to iron speciation thresholds (include
FeHR/FeT as predictor variable, rather than using as filter
cutoffs)
U.thresh.test.rf <- trace.toc.full %>%
filter(!is.na(U..ppm.) & !is.na(FeHR.FeT))
nrow(U.thresh.test.rf)
U.thresh.test.rf.results <- Monte.Carlo.rF(data = U.thresh.test.rf,
resp.var = "U..ppm.",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"U..ppm.",
"TOC..wt..",
"FeHR.FeT",
"Al..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
U.thresh.test.rf.partial <- U.thresh.test.rf.results$partial.plot.data
U.thresh.test.rf.import <- U.thresh.test.rf.results$importance.data
u.thresh.test.pdp <- interp.pdp(data = U.thresh.test.rf.partial, age.rounding.factor = 1, n = n)
We combine these analyses with our primary U analyses for
plotting.
u.pdp$treatment <- "Fe-speciation filter approach"
u.thresh.test.pdp$treatment <- "Fe-speciation predictor approach"
u.pdp.sum.thresh <- rbind(u.pdp,
u.thresh.test.pdp)
Proportion euxinic with TOC
Note that we hide warnings here because of random forest warning
about using binary variable in a regression (here we do want to do a
regression to replicate the broad approach of Sperling et al. 2015,
Nature).
Fepy.anox.w_TOC.rf <- trace.toc.full %>%
filter(!is.na(Fe.py.FeHR) & !is.na(TOC..wt..)) %>%
filter(FeHR.FeT >= 0.38)
nrow(Fepy.anox.w_TOC.rf)
# For the analysis of the proportion of euxinic samples, we also need to code samples based # upon whether they are euxinic (based on iron speciation) in a binary fashion.
Fepy.anox.w_TOC.rf$euxinic.Fe[Fepy.anox.w_TOC.rf$Fe.py.FeHR >= 0.7] <- 1
Fepy.anox.w_TOC.rf$euxinic.Fe[Fepy.anox.w_TOC.rf$Fe.py.FeHR < 0.7] <- 0
Fepy.anox.w_TOC.rf.results <- Monte.Carlo.rF(data = Fepy.anox.w_TOC.rf,
resp.var = "euxinic.Fe",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"euxinic.Fe",
"Al..wt..",
"TOC..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
Fepy.anox.w_TOC.rf.partial <- Fepy.anox.w_TOC.rf.results$partial.plot.data
Fepy.anox.w_TOC.rf.import <- Fepy.anox.w_TOC.rf.results$importance.data
prop_eux.w_TOC.pdp <- interp.pdp(data = Fepy.anox.w_TOC.rf.partial, age.rounding.factor = 1, n = n)
We combine these analyses with our primary proportion euxinic
analyses for plotting.
prop_eux.pdp$treatment <- "Anoxic samples"
prop_eux.w_TOC.pdp$treatment <- "Anoxic samples + TOC"
prop_eux.pdp.sum <- rbind(prop_eux.pdp,
prop_eux.w_TOC.pdp)
Anoxic TOC (no pyrite)
TOC.anox.rf <- trace.toc.full %>%
filter(!is.na(TOC..wt..)) %>%
filter(FeHR.FeT >= 0.38 | FeT.Al >= 0.53)
nrow(TOC.anox.rf)
TOC.anox.rf.results <- Monte.Carlo.rF(data = TOC.anox.rf,
resp.var = "TOC..wt..",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"TOC..wt..",
"Al..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
TOC.anox.rf.partial <- TOC.anox.rf.results$partial.plot.data
TOC.anox.rf.import <- TOC.anox.rf.results$importance.data
TOC.anox.pdp <- interp.pdp(data = TOC.anox.rf.partial, age.rounding.factor = 1, n = n)
Anoxic TOC with pyrite
TOC.anox.py.rf <- trace.toc.full %>%
filter(!is.na(TOC..wt..) & !is.na(Fe.py.FeHR)) %>%
filter(FeHR.FeT >= 0.38)
nrow(TOC.anox.rf)
TOC.anox.py.rf.results <- Monte.Carlo.rF(data = TOC.anox.py.rf,
resp.var = "TOC..wt..",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"TOC..wt..",
"Al..wt..",
"Fe.py.FeHR"),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = TRUE)
TOC.anox.py.rf.partial <- TOC.anox.py.rf.results$partial.plot.data
TOC.anox.py.rf.import <- TOC.anox.py.rf.results$importance.data
TOC.anox.py.pdp <- interp.pdp(data = TOC.anox.py.rf.partial, age.rounding.factor = 1, n = n)
We combine these analyses with our primary TOC analyses for
plotting.
TOC.pdp$treatment <- "All samples"
TOC.anox.pdp$treatment <- "Anoxic samples"
TOC.anox.py.pdp$treatment <- "Anoxic samples + Fepy/FeHR"
TOC.pdp.sum <- rbind(TOC.pdp,
TOC.anox.pdp,
TOC.anox.py.pdp)
Summary plotting of analyses including varying redox treatments
In these summary plots we just plot the interquartile ranges for our
analyses to ease the comparison of trends in central tendancy.
Mo.plot.for.redox.var.sum <- ggplot(mo.pdp.sum, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75, fill=treatment, color=treatment), alpha=.7, size=.5)+
scale_fill_manual(values = c(rgb(127, 160, 86, maxColorValue = 255),
rgb(179, 224, 149, maxColorValue = 255)),
name = "Treatment")+
scale_color_manual(values = c(rgb(127, 160, 86, maxColorValue = 255),
rgb(179, 224, 149, maxColorValue = 255)),
name = "Treatment")+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.4,82),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,254,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=26),
legend.text = element_text( size=22),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
U.plot.for.redox.var.sum <- ggplot(u.pdp.sum, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75, fill=treatment, color=treatment), alpha=.7, size=.5)+
scale_fill_manual(values = c(rgb(103, 165, 153, maxColorValue = 255),
rgb(191, 208, 186, maxColorValue = 255)),
name = "Treatment")+
scale_color_manual(values = c(rgb(103, 165, 153, maxColorValue = 255),
rgb(191, 208, 186, maxColorValue = 255)),
name = "Treatment")+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.4,82),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("U (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=26),
legend.text = element_text( size=22),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
prop_eux.plot.for.redox.var.sum <- ggplot(prop_eux.pdp.sum, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75, fill=treatment, color=treatment), alpha=.7, size=.5)+
scale_fill_manual(values = c(rgb(240, 64, 40, maxColorValue = 255),
rgb(251, 154, 133, maxColorValue = 255)),
name = "Treatment")+
scale_color_manual(values = c(rgb(240, 64, 40, maxColorValue = 255),
rgb(251, 154, 133, maxColorValue = 255)),
name = "Treatment")+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.01,1.03),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Proportion euxinic")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=26),
legend.text = element_text( size=22),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
TOC.plot.for.redox.var.sum <- ggplot(TOC.pdp.sum, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75, fill=treatment, color=treatment), alpha=.7, size=.5)+
scale_fill_manual(values = c(rgb(203, 140, 55, maxColorValue = 255),
rgb(254, 179, 66, maxColorValue = 255),
rgb(254, 217, 106, maxColorValue = 255)),
name = "Treatment")+
scale_color_manual(values = c(rgb(203, 140, 55, maxColorValue = 255),
rgb(254, 179, 66, maxColorValue = 255),
rgb(254, 217, 106, maxColorValue = 255)),
name = "Treatment")+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-0.04,8.3),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("TOC (wt%)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=26),
legend.text = element_text( size=22),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
summary.redox.var.plot <- ggarrange2(Mo.plot.for.redox.var.sum, prop_eux.plot.for.redox.var.sum, U.plot.for.redox.var.sum, TOC.plot.for.redox.var.sum, ncol=2, heights=c(1,1))

ggsave("Figure Sx Partial Dependence Plot (varying redox treatments) 20240207 100 iterations.pdf", summary.redox.var.plot, height=16, width=26)
Also compile Fe spectiation threshold vs predictor test plots.
Mo.plot.for.thresh.sum <- ggplot(mo.pdp.sum.thresh, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75, fill=treatment, color=treatment), alpha=.7, size=.5)+
scale_fill_manual(values = c(rgb(127, 160, 86, maxColorValue = 255),
rgb(179, 224, 149, maxColorValue = 255)),
name = "Treatment")+
scale_color_manual(values = c(rgb(127, 160, 86, maxColorValue = 255),
rgb(179, 224, 149, maxColorValue = 255)),
name = "Treatment")+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.4,82),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,254,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=26),
legend.text = element_text( size=22),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
U.plot.for.thresh.sum <- ggplot(u.pdp.sum.thresh, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75, fill=treatment, color=treatment), alpha=.7, size=.5)+
scale_fill_manual(values = c(rgb(103, 165, 153, maxColorValue = 255),
rgb(191, 208, 186, maxColorValue = 255)),
name = "Treatment")+
scale_color_manual(values = c(rgb(103, 165, 153, maxColorValue = 255),
rgb(191, 208, 186, maxColorValue = 255)),
name = "Treatment")+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.4,82),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("U (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=26),
legend.text = element_text( size=22),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
Mo.plot.for.thresh.solo <- ggplot(mo.pdp.sum.thresh, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75, fill=treatment, color=treatment), alpha=.7, size=.5)+
scale_fill_manual(values = c(rgb(127, 160, 86, maxColorValue = 255),
rgb(179, 224, 149, maxColorValue = 255)),
name = "Treatment")+
scale_color_manual(values = c(rgb(127, 160, 86, maxColorValue = 255),
rgb(179, 224, 149, maxColorValue = 255)),
name = "Treatment")+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.4,82),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=26),
legend.text = element_text( size=22),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
summary.thresh.plot <- ggarrange2(Mo.plot.for.thresh.sum, U.plot.for.thresh.sum, ncol=1, heights=c(1,1))
ggsave("Figure Sx Partial Dependence Plot (testing Fe-speciation thresholds) 20240207 100 iterations.pdf", summary.thresh.plot, height=15, width=13)
ggsave("Figure Sx Partial Dependence Plot (testing Fe-speciation thresholds) 20240207 100 iterations Mo only.pdf", Mo.plot.for.thresh.solo, height=8.6, width=13)
9. Primary analyses without Al
We also re-run our analyses without incorporating [Al] as a broad
proxy for detrital input.
Molybdenum
Mo.anox.py.no_Al.rf.results <- Monte.Carlo.rF(data = Mo.anox.py.rf,
resp.var = "Mo..ppm.",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"Mo..ppm.",
"TOC..wt..",
"Fe.py.FeHR"),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = FALSE)
Mo.anox.py.no_Al.rf.partial <- Mo.anox.py.no_Al.rf.results$partial.plot.data
Mo.anox.py.no_Al.rf.import <- Mo.anox.py.no_Al.rf.results$importance.data
mo.no_Al.pdp <- interp.pdp(data = Mo.anox.py.no_Al.rf.partial, age.rounding.factor = 1, n = n)
Uranium
U.anox.no_Al.rf.results <- Monte.Carlo.rF(data = U.anox.rf,
resp.var = "U..ppm.",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"U..ppm.",
"TOC..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = FALSE)
U.anox.no_Al.rf.partial <- U.anox.no_Al.rf.results$partial.plot.data
U.anox.no_Al.rf.import <- U.anox.no_Al.rf.results$importance.data
u.no_Al.pdp <- interp.pdp(data = U.anox.no_Al.rf.partial, age.rounding.factor = 1, n = n)
Proportion euxinic
Note that we hide warnings here because of random forest warning
about using binary variable in a regression (here we do want to do a
regression to replicate the broad approach of Sperling et al. 2015,
Nature).
Fepy.anox.no_Al.rf.results <- Monte.Carlo.rF(data = Fepy.anox.rf,
resp.var = "euxinic.Fe",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"euxinic.Fe"),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = FALSE)
Fepy.anox.no_Al.rf.partial <- Fepy.anox.no_Al.rf.results$partial.plot.data
Fepy.anox.no_Al.rf.import <- Fepy.anox.no_Al.rf.results$importance.data
prop_eux.no_Al.pdp <- interp.pdp(data = Fepy.anox.no_Al.rf.partial, age.rounding.factor = 1, n = n)
Total Organic Carbon
TOC.all.no_Al.rf.results <- Monte.Carlo.rF(data = TOC.all.rf,
resp.var = "TOC..wt..",
vars = c("age.model",
"site.type",
"metamorphic.bin",
"basin.type",
"site.latitude",
"site.longitude",
"lithology.name",
"environmental.bin",
"TOC..wt.."),
n = n,
run.age.unc = TRUE,
run.partial.context = TRUE,
run.Al.impute = FALSE)
TOC.all.no_Al.rf.partial <- TOC.all.no_Al.rf.results$partial.plot.data
TOC.all.no_Al.rf.import <- TOC.all.no_Al.rf.results$importance.data
TOC.no_Al.pdp <- interp.pdp(data = TOC.all.no_Al.rf.partial, age.rounding.factor = 1, n = n)
Combine to generate summary plot
Mo.no_Al.plot.for.sum <- ggplot(mo.no_Al.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.4,62),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
U.no_Al.plot.for.sum <- ggplot(u.no_Al.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.4,82),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("U (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
prop_eux.no_Al.plot.for.sum <- ggplot(prop_eux.no_Al.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.01,1.03),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Proportion euxinic")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
TOC.no_Al.plot.for.sum <- ggplot(TOC.no_Al.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-0.04,8.3),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("TOC (wt%)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
summary.plot <- ggarrange2(Mo.no_Al.plot.for.sum, prop_eux.no_Al.plot.for.sum, U.no_Al.plot.for.sum, TOC.no_Al.plot.for.sum, ncol=2, heights=c(1,1))

ggsave("Figure Sx Partial Dependence Plot (no Al) with shading 20240207 100 iterations.pdf", summary.plot, height=14, width=26)
save(mo.pdp.sum, u.pdp.sum, prop_eux.pdp.sum, TOC.pdp.sum, mo.pdp.sum.thresh, mo.no_Al.pdp, u.no_Al.pdp, prop_eux.no_Al.pdp, TOC.no_Al.pdp, file = "supplementary-rf-analyses-20240207-100-iterations.RData")
10. Variable importance plots
We will summarize the variable importance of the predictor variables
in our primary Monte Carlo random forest analyses, using box and whisker
plots to illustrate the distributions generated by our Monte Carlo
approach.
Mo.MSE.plot <- ggplot(Mo.anox.py.rf.import, aes(x = Variable, y = IncMSE))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+coord_flip()+
theme_bw()+
ylab("Increased Mean\nSquared Error (%)")+
ggtitle("Molybdenum")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
Mo.Node.plot <- ggplot(Mo.anox.py.rf.import, aes(x = Variable, y = IncNodePurity))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+coord_flip()+
theme_bw()+
ylab("Increased\nNode Purity")+
ggtitle("Molybdenum")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
U.MSE.plot <- ggplot(U.anox.rf.import, aes(x = Variable, y = IncMSE))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+coord_flip()+
theme_bw()+
ylab("Increased Mean\nSquared Error (%)")+
ggtitle("Uranium")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
U.Node.plot <- ggplot(U.anox.rf.import, aes(x = Variable, y = IncNodePurity))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+coord_flip()+
theme_bw()+
ylab("Increased\nNode Purity")+
ggtitle("Uranium")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
prop_eux.MSE.plot <- ggplot(Fepy.anox.rf.import, aes(x = Variable, y = IncMSE))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+coord_flip()+
theme_bw()+
ylab("Increased Mean\nSquared Error (%)")+
ggtitle("Proportion Euxinic")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
prop_eux.Node.plot <- ggplot(Fepy.anox.rf.import, aes(x = Variable, y = IncNodePurity))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+coord_flip()+
theme_bw()+
ylab("Increased\nNode Purity")+
ggtitle("Proportion Euxinic")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
TOC.MSE.plot <- ggplot(TOC.all.rf.import, aes(x = Variable, y = IncMSE))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+coord_flip()+
theme_bw()+
ylab("Increased Mean\nSquared Error (%)")+
ggtitle("Total Organic Carbon")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
TOC.Node.plot <- ggplot(TOC.all.rf.import, aes(x = Variable, y = IncNodePurity))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+coord_flip()+
theme_bw()+
ylab("Increased\nNode Purity")+
ggtitle("Total Organic Carbon")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
Imp.sum <- ggarrange2(Mo.MSE.plot,
Mo.Node.plot,
U.MSE.plot,
U.Node.plot,
prop_eux.MSE.plot,
prop_eux.Node.plot,
TOC.MSE.plot,
TOC.Node.plot,
ncol=2)

ggsave("Figure Sx Variable importance plots for Monte Carlo random forest 20240207 100 iterations.pdf", Imp.sum, height=27, width=17)
10. Model fit plots
We will summarize both model fit and the cross validated random
forest variables used in our primary Monte Carlo random forest analyses,
using box and whisker plots to illustrate the distributions generated by
our Monte Carlo approach.
First, make grouped dataframe for all model fit.
Mo.anox.py.rf.model$proxy <- "Mo"
U.anox.rf.model$proxy <- "U"
Fepy.anox.rf.model$proxy <- "Fepy"
TOC.all.rf.model$proxy <- "TOC"
model.sum <- rbind(Mo.anox.py.rf.model,
U.anox.rf.model,
Fepy.anox.rf.model,
TOC.all.rf.model
)
# for just the MSE plots, separate ppm and wt (%) proxies
ppm.model.sum <- rbind(Mo.anox.py.rf.model,
U.anox.rf.model
)
perc.model.sum <- rbind(Fepy.anox.rf.model,
TOC.all.rf.model
)
Then, we make a summary figure…
#names(model.fit.data) <- c("best.nodesize", "best.mtry", "best.ntree", "best.MSE", "best.var")
MSE.plot.ppm <- ggplot(ppm.model.sum, aes(x = proxy, y = best.MSE))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+
theme_bw()+
ylab("Mean Squared Error")+
ggtitle("Mean Squared Error")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
#axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
MSE.plot.TOC <- ggplot(TOC.all.rf.model, aes(x = proxy, y = best.MSE))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+
theme_bw()+
ylab("Mean Squared Error")+
ggtitle("Mean Squared Error")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
#axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
MSE.plot.Fepy <- ggplot(Fepy.anox.rf.model, aes(x = proxy, y = best.MSE))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+
theme_bw()+
ylab("Mean Squared Error")+
ggtitle("Mean Squared Error")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
#axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
var.plot <- ggplot(model.sum, aes(x = proxy, y = best.var))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+
theme_bw()+
ylab("Variance (%)")+
ggtitle("Variance (%)")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
#axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
ntree.plot <- ggplot(model.sum, aes(x = proxy, y = best.ntree))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+
theme_bw()+
ylab("Number of Trees")+
ggtitle("Number of Trees")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
#axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
mtry.plot <- ggplot(model.sum, aes(x = proxy, y = best.mtry))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+
theme_bw()+
ylab("mtry (features at each split)")+
ggtitle("mtry (features at each split)")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
#axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
nodesize.plot <- ggplot(model.sum, aes(x = proxy, y = best.nodesize))+stat_boxplot(geom ='errorbar', width = 0.6, size=0.6, color="grey20", lwd=.1)+ geom_boxplot(outlier.alpha = .5, outlier.shape=16, outlier.size=2, outlier.color="grey20", color="grey20", lwd=0.6, fill="grey80", fatten = 1.3)+
theme_bw()+
ylab("Node Size")+
ggtitle("Node Size")+
theme(panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1.1),
axis.title = element_text(size=30),
axis.text = element_text(size=24, colour="black"),
plot.title = element_text(size=30, face = "bold"),
plot.margin = ggplot2::margin(10,30,10,10),
legend.position="none",
#axis.title.y = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
model.sum <- ggarrange2(MSE.plot.ppm,
MSE.plot.TOC,
MSE.plot.Fepy,
var.plot,
ntree.plot,
mtry.plot,
nodesize.plot,
ncol=4)

ggsave("Figure Sx Model fit and parameter plots for Monte Carlo random forest 20240207 100 iterations.pdf", model.sum, height=15, width=32)
11. Appendix - Three phase plot evolution for talks
No shading.
Mo.plot.for.sum.no.shade <- ggplot(mo.pdp, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.4,62),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
U.plot.for.sum.no.shade <- ggplot(u.pdp, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.4,82),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("U (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
prop_eux.plot.for.sum.no.shade <- ggplot(prop_eux.pdp, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.01,1.03),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Proportion euxinic")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
TOC.plot.for.sum.no.shade <- ggplot(TOC.pdp, aes(x=Age))+
#annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-0.04,8.3),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("TOC (wt%)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
summary.plot.no.shade <- ggarrange2(Mo.plot.for.sum.no.shade, prop_eux.plot.for.sum.no.shade, U.plot.for.sum.no.shade, TOC.plot.for.sum.no.shade, ncol=2, heights=c(1,1))

ggsave("Figure 2 Partial Dependence Plot no shading 20240207 100 iterations.pdf", summary.plot.no.shade, height=14, width=26)
Neoproterozoic shading.
Mo.plot.for.sum.shade.1 <- ggplot(mo.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.4,62),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
U.plot.for.sum.shade.1 <- ggplot(u.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.4,82),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("U (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
prop_eux.plot.for.sum.shade.1 <- ggplot(prop_eux.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.01,1.03),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Proportion euxinic")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
TOC.plot.for.sum.shade.1 <- ggplot(TOC.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
#annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-0.04,8.3),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("TOC (wt%)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
summary.plot.shade.1<- ggarrange2(Mo.plot.for.sum.shade.1, prop_eux.plot.for.sum.shade.1, U.plot.for.sum.shade.1, TOC.plot.for.sum.shade.1, ncol=2, heights=c(1,1))

ggsave("Figure 2 Partial Dependence Plot with shading part 1 20240207 100 iterations.pdf", summary.plot.shade.1, height=14, width=26)
Neoproterozoic and early Paleozoic shading.
Mo.plot.for.sum.shade.2 <- ggplot(mo.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.4,62),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Mo (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
U.plot.for.sum.shade.2 <- ggplot(u.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-.4,82),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("U (ppm)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
prop_eux.plot.for.sum.shade.2 <- ggplot(prop_eux.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_cartesian(xlim=rev(c(298.9,1000)), ylim=c(-.01,1.03),expand=FALSE)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("Proportion euxinic")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top", legend.justification = c(0, 0),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
TOC.plot.for.sum.shade.2 <- ggplot(TOC.pdp, aes(x=Age))+
annotate(geom="rect", xmax=Inf, xmin=541, ymin=-Inf, ymax=Inf, fill="#C9DEE8", alpha=0.2)+
annotate(geom="rect", xmax=541, xmin=418, ymin=-Inf, ymax=Inf, fill="#9DC2D6", alpha=0.4)+
#annotate(geom="rect", xmax=418, xmin=-Inf, ymin=-Inf, ymax=Inf, fill="#619FBB", alpha=0.4)+
geom_ribbon(aes(ymin = min, ymax = max), alpha=1, fill="grey85", color="grey70", size=.3)+
geom_ribbon(aes(ymin = perc.25, ymax = perc.75), alpha=1, fill="grey60", color="grey50", size=.5)+
theme_bw()+
coord_geo(xlim=rev(c(298.9,1002)), expand=FALSE, ylim=c(-0.04,8.3),
pos = as.list(rep("bottom", 1)),
abbrv=list(TRUE),
dat = list(periods.edit),
height = list(unit(2, "lines")),
bord=list(c("left", "bottom", "right")), lwd=as.list(c(1)), size=8)+
scale_x_reverse(breaks=c(300,400,500,600,700,800,900,1000))+
ylab("TOC (wt%)")+xlab("Time (Ma)")+
theme(plot.margin = ggplot2::margin(.1,1,.3,1,"cm"),panel.border = element_rect(fill=NA,color="black", size=2,linetype="solid"),
axis.ticks = element_line(size=1),
axis.line = element_line(lineend = 'square'),
axis.title = element_text(size=34),
axis.text = element_text( size=26, color="black"),
legend.title = element_text(size=20),
legend.text = element_text( size=16),
axis.ticks.length = unit(5, "points"),
legend.position="top",
legend.justification = c(0, 0),
axis.title.y = element_text(vjust=3),
panel.grid.major = element_blank(),panel.grid.minor = element_blank())
summary.plot.shade.2 <- ggarrange2(Mo.plot.for.sum.shade.2, prop_eux.plot.for.sum.shade.2, U.plot.for.sum.shade.2, TOC.plot.for.sum.shade.2, ncol=2, heights=c(1,1))
